"""howard.objects.variants"""
import csv
import gc
import gzip
import io
import multiprocessing as mp
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel
import cyvcf2
import pyBigWig
import math

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        The `__init__` method initializes the object's variables and sets the input,
        output, config, param, connection and header.

        :param conn: the connection to the database
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration
        :param param: a dictionary containing the parameters
        :param load: if True, load the input data immediately
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # Connection
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        The `set_samples` method sets the `samples` attribute of the object to the
        provided list, or, if no list is given, retrieves it from the parameters.

        :param samples: the list of samples to set. If no samples are provided, the
            method tries to get the samples from the object's parameters using
            `get_param`
        :type samples: list
        :return: The `samples` list.
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        This function returns the list of samples.
        :return: The `samples` attribute of the object.
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        This function returns the value of the "check" key within the "samples"
        dictionary retrieved from the parameters.
        :return: The value of the key "check" inside the "samples" dictionary returned
            by `get_param()`. If the key is not found, it returns `True`.
        """

        return self.get_param().get("samples", {}).get("check", True)
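    # Usage sketch (illustrative, not from the codebase): build a Variants
    # object from a VCF file and read back the sample settings. The path
    # "example.vcf.gz" is a hypothetical input.
    #
    #   variants = Variants(input="example.vcf.gz", load=True)
    #   samples = variants.get_samples()       # list from param, or None
    #   check = variants.get_samples_check()   # defaults to True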
    def set_input(self, input: str = None) -> None:
        """
        The `set_input` method takes an input file name, extracts its name and
        extension, and sets the corresponding attributes on the object.

        :param input: the input file, either as a path string or as an object with a
            `name` attribute (e.g. a file object)
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            except AttributeError:
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        The `set_config` method assigns the given dictionary as the configuration
        object for the class.

        :param config: a dictionary containing configuration settings for the class
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        The `set_param` method assigns the given dictionary as the `param` attribute
        of the class instance.

        :param param: a dictionary of parameters
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        This function initializes the variables that will be used in the rest of the
        class.
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        It returns the value of the key "indexing" in the param dictionary. If the key
        is not present, it returns False.
        :return: The value of the indexing parameter.
        """

        return self.get_param().get("indexing", False)
218 """ 219 220 # config 221 config = self.get_config() 222 223 # Connexion config 224 connexion_config = {} 225 threads = self.get_threads() 226 227 # Threads 228 if threads: 229 connexion_config["threads"] = threads 230 231 # Memory 232 # if config.get("memory", None): 233 # connexion_config["memory_limit"] = config.get("memory") 234 if self.get_memory(): 235 connexion_config["memory_limit"] = self.get_memory() 236 237 # Temporary directory 238 if config.get("tmp", None): 239 connexion_config["temp_directory"] = config.get("tmp") 240 241 # Access 242 if config.get("access", None): 243 access = config.get("access") 244 if access in ["RO"]: 245 access = "READ_ONLY" 246 elif access in ["RW"]: 247 access = "READ_WRITE" 248 connexion_db = self.get_connexion_db() 249 if connexion_db in ":memory:": 250 access = "READ_WRITE" 251 connexion_config["access_mode"] = access 252 253 return connexion_config 254 255 def get_duckdb_settings(self) -> dict: 256 """ 257 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 258 string. 259 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 260 """ 261 262 # config 263 config = self.get_config() 264 265 # duckdb settings 266 duckdb_settings_dict = {} 267 if config.get("duckdb_settings", None): 268 duckdb_settings = config.get("duckdb_settings") 269 duckdb_settings = full_path(duckdb_settings) 270 # duckdb setting is a file 271 if os.path.exists(duckdb_settings): 272 with open(duckdb_settings) as json_file: 273 duckdb_settings_dict = yaml.safe_load(json_file) 274 # duckdb settings is a string 275 else: 276 duckdb_settings_dict = json.loads(duckdb_settings) 277 278 return duckdb_settings_dict 279 280 def set_connexion_db(self) -> str: 281 """ 282 The function `set_connexion_db` returns the appropriate database connection string based on the 283 input format and connection type. 284 :return: the value of the variable `connexion_db`. 285 """ 286 287 # Default connexion db 288 default_connexion_db = ":memory:" 289 290 # Find connexion db 291 if self.get_input_format() in ["db", "duckdb"]: 292 connexion_db = self.get_input() 293 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 294 connexion_db = default_connexion_db 295 elif self.get_connexion_type() in ["tmpfile"]: 296 tmp_name = tempfile.mkdtemp( 297 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 298 ) 299 connexion_db = f"{tmp_name}/tmp.db" 300 elif self.get_connexion_type() != "": 301 connexion_db = self.get_connexion_type() 302 else: 303 connexion_db = default_connexion_db 304 305 # Set connexion db 306 self.connexion_db = connexion_db 307 308 return connexion_db 309 310 def set_connexion(self, conn) -> None: 311 """ 312 The function `set_connexion` creates a connection to a database, with options for different 313 database formats and settings. 314 315 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 316 database. If a connection is not provided, a new connection to an in-memory database is created. 
    def set_connexion(self, conn) -> None:
        """
        The function `set_connexion` creates a connection to the database, with
        options for different database formats and settings.

        :param conn: the connection to the database. If no connection is provided, a
            new connection is created based on the configured format (duckdb or
            sqlite) and connection database (in-memory by default)
        """

        # Connection db
        connexion_db = self.set_connexion_db()

        # Connection config
        connexion_config = self.get_connexion_config()

        # Connection format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connection format
        self.connexion_format = connexion_format

        # Connection
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # DuckDB settings
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connection
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        The `set_output` method sets the output file, extracting the output name,
        extension and format.

        :param output: the name of the output file, either as a path string or as an
            object with a `name` attribute. If no output is provided, the output is
            set to `None`
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None
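    # Connection sketch (assumed config keys, matching the getters above): use a
    # SQLite backend on a temporary database file instead of in-memory DuckDB.
    #
    #   config = {"connexion_format": "sqlite", "connexion_type": "tmpfile"}
    #   variants = Variants(input="example.vcf", config=config)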
    def set_header(self) -> None:
        """
        It reads the header of a VCF file and stores it both as a list of strings and
        as a VCF object.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header provided in config
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # Within a VCF file format (header within the input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # Within a compressed VCF file (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # Within an uncompressed VCF file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in a default external file .hdr
                elif os.path.exists(input_file + ".hdr"):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    # Try to get header INFO fields and file columns
                    try:

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for INFO fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            with open(header_file_tmp, "w") as f:
                                vcf.Writer(f, db_header_from_columns)

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except Exception:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:
                # Unknown format
                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # Header as list
            self.header_list = header_list

            # Header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
500 """ 501 502 # Connexion format 503 connexion_format = self.get_connexion_format() 504 505 # Limit in query 506 if limit: 507 pd.set_option("display.max_rows", limit) 508 if connexion_format in ["duckdb"]: 509 df = ( 510 self.conn.execute(query) 511 .fetch_record_batch(limit) 512 .read_next_batch() 513 .to_pandas() 514 ) 515 elif connexion_format in ["sqlite"]: 516 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 517 518 # Full query 519 else: 520 if connexion_format in ["duckdb"]: 521 df = self.conn.execute(query).df() 522 elif connexion_format in ["sqlite"]: 523 df = pd.read_sql_query(query, self.conn) 524 525 return df 526 527 def get_overview(self) -> None: 528 """ 529 The function prints the input, output, config, and dataframe of the current object 530 """ 531 table_variants_from = self.get_table_variants(clause="from") 532 sql_columns = self.get_header_columns_as_sql() 533 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 534 df = self.get_query_to_df(sql_query_export) 535 log.info( 536 "Input: " 537 + str(self.get_input()) 538 + " [" 539 + str(str(self.get_input_format())) 540 + "]" 541 ) 542 log.info( 543 "Output: " 544 + str(self.get_output()) 545 + " [" 546 + str(str(self.get_output_format())) 547 + "]" 548 ) 549 log.info("Config: ") 550 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 551 "\n" 552 ): 553 log.info("\t" + str(d)) 554 log.info("Param: ") 555 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 556 "\n" 557 ): 558 log.info("\t" + str(d)) 559 log.info("Sample list: " + str(self.get_header_sample_list())) 560 log.info("Dataframe: ") 561 for d in str(df).split("\n"): 562 log.info("\t" + str(d)) 563 564 # garbage collector 565 del df 566 gc.collect() 567 568 return None 569 570 def get_stats(self) -> dict: 571 """ 572 The `get_stats` function calculates and returns various statistics of the current object, 573 including information about the input file, variants, samples, header fields, quality, and 574 SNVs/InDels. 575 :return: a dictionary containing various statistics of the current object. 
    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the
        current object, including information about the input file, variants, samples,
        header fields, quality and SNVs/InDels.
        :return: a dictionary of statistics, with one key per section ("Infos",
            "Variants", "Samples", "Header", "Quality").
        """

        # Log
        log.info(f"Stats Calculation...")

        # Table variants
        table_variants_from = self.get_table_variants()

        # Stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check samples
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                           REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                           count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                           concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                    )
                    GROUP BY genotype
                """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number (special VCF codes: '.'=unknown, A=per-ALT, G=per-genotype, R=per-allele)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description
                if header_type_infos[info].desc is not None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                  AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE (len(REF) > 1 OR len(ALT) > 1)
                  AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
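    # Shape sketch (abridged, hypothetical values) of the dictionary returned by
    # get_stats():
    #
    #   {
    #       "Infos": {"Input file": "example.vcf", "Number of variants": 1234, ...},
    #       "Variants": {"Number of variants by chromosome": {...},
    #                    "Counts": {...}, "Substitutions": {...}},
    #       "Samples": {...},
    #       "Header": {"List of INFO fields": {...}, "List of FORMAT fields": {...}},
    #       "Quality": {"Stats": {...}},
    #   }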
    def stats_to_file(self, file: str = None) -> str:
        """
        The function `stats_to_file` retrieves the statistics, serializes them into a
        JSON object and writes the JSON object to the specified file.

        :param file: the file path where the JSON data will be written
        :type file: str
        :return: the name of the file that was written to.
        """

        # Get stats
        stats = self.get_stats()

        # Serialize to JSON
        json_object = json.dumps(stats, indent=4)

        # Write to the output file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file
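    # Usage sketch: dump the stats to a JSON file ("stats.json" is a hypothetical
    # path).
    #
    #   variants.stats_to_file(file="stats.json")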
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a Markdown file and prints the statistics
        contained in a JSON file in a formatted manner.

        :param output_file: the path of the output file where the stats will be
            written in Markdown format. If no `output_file` is provided, a temporary
            directory is created and the stats are saved in a file named "stats.md"
            within it
        :type output_file: str
        :param json_file: the path to the JSON file where the statistics will be
            saved. If no value is provided, a temporary directory is created and a
            default file name "stats.json" is used
        :type json_file: str
        :return: None.
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except Exception:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except Exception:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats into the Markdown file
            with open(output_file, "w") as md_file:
                for item in output_title:
                    md_file.write("%s\n" % item)
                for item in output_index:
                    md_file.write("%s\n" % item)
                for item in output:
                    md_file.write("%s\n" % item)

            # Output stats in Markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        It returns the value of the input variable.
        :return: The input file.
        """
        return self.input
    def get_input_format(self, input_file: str = None) -> str:
        """
        This function returns the format of the input file, either from the provided
        input file or from the object's input.

        :param input_file: the file path of the input file. If no `input_file` is
            provided, it defaults to the object's input
        :type input_file: str
        :return: The format of the input file.
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        The function `get_input_compressed` returns whether the input file is
        compressed.

        :param input_file: the file path of the input file. If no `input_file` is
            provided, it defaults to the object's input
        :type input_file: str
        :return: The compression status of the input file.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        It returns the output file.
        :return: The output file.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        The function `get_output_format` returns the format of the output file.

        :param output_file: the file path of the output file. If no `output_file` is
            provided, it defaults to the object's output
        :type output_file: str
        :return: The format of the output file.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config.
        :return: The config variable.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param.
        :return: The param variable.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object.
        :return: The connexion_db.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix.
        """
        return self.prefix
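    # Format sketch: for an input such as "sample.vcf.gz" (hypothetical),
    # get_input_format() would report "vcf" and get_input_compressed() a truthy
    # value, both derived via the shared get_file_format/get_file_compressed
    # helpers.
    #
    #   variants.get_input_format("sample.vcf.gz")      # -> "vcf"
    #   variants.get_input_compressed("sample.vcf.gz")  # -> truthy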
1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants 1041 1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 ) 1053 1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory") 1061 1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn 1069 1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 1073 :return: The connection is being closed. 1074 """ 1075 return self.conn.close() 1076 1077 def get_header(self, type: str = "vcf"): 1078 """ 1079 This function returns the header of the VCF file as a list of strings 1080 1081 :param type: the type of header you want to get, defaults to vcf (optional) 1082 :return: The header of the vcf file. 1083 """ 1084 1085 if self.header_vcf: 1086 if type == "vcf": 1087 return self.header_vcf 1088 elif type == "list": 1089 return self.header_list 1090 else: 1091 if type == "vcf": 1092 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1093 return header 1094 elif type == "list": 1095 return vcf_required 1096 1097 def get_header_infos_list(self) -> list: 1098 """ 1099 This function retrieves a list of information fields from the header. 1100 :return: A list of information fields from the header. 1101 """ 1102 1103 # Init 1104 infos_list = [] 1105 1106 for field in self.get_header().infos: 1107 infos_list.append(field) 1108 1109 return infos_list 1110 1111 def get_header_length(self, file: str = None) -> int: 1112 """ 1113 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1114 line. 1115 1116 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1117 header file. If this argument is provided, the function will read the header from the specified 1118 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1119 :type file: str 1120 :return: the length of the header list, excluding the #CHROM line. 1121 """ 1122 1123 if file: 1124 return len(self.read_vcf_header_file(file=file)) - 1 1125 elif self.get_header(type="list"): 1126 return len(self.get_header(type="list")) - 1 1127 else: 1128 return 0 1129 1130 def get_header_columns(self) -> str: 1131 """ 1132 This function returns the header list of a VCF 1133 1134 :return: The length of the header list. 
    def get_header_columns(self) -> str:
        """
        This function returns the #CHROM columns line of the VCF header.

        :return: The #CHROM line of the header, or "" if there is no header.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        This function returns the columns of the VCF header as a list.

        :return: The header columns as a list, or [] if there is no header.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as a comma-separated list of quoted
        SQL identifiers.

        :return: The header columns as an SQL string.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(
        self, check: bool = False, samples: list = None, samples_force: bool = False
    ) -> list:
        """
        The function `get_header_sample_list` returns the list of samples from the VCF
        header, with optional checking and filtering based on the input parameters.

        :param check: whether to check that the samples in the list are properly
            defined as genotype columns. If `check` is set to `True`, each sample is
            verified with `is_genotype_column`, defaults to False
        :type check: bool (optional)
        :param samples: an optional subset of samples from the header. If a list of
            sample names is provided, the function checks that each sample is defined
            in the header and warns about the ones that are not
        :type samples: list
        :param samples_force: whether to return the sample list without checking if
            the samples are genotype columns. If `samples_force` is set to `True`,
            the sample list is returned without further checks, defaults to False
        :type samples_force: bool (optional)
        :return: A list of samples based on the input parameters.
        """

        # Init
        samples_list = []

        if samples is None:
            samples_list = self.header_vcf.samples
        else:
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        if check:
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list
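    # Columns sketch: for a default header, the three views line up as follows.
    #
    #   variants.get_header_columns()
    #   # '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
    #   variants.get_header_columns_as_list()
    #   # ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
    #   variants.get_header_columns_as_sql()
    #   # '"#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO"'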
    def is_genotype_column(self, column: str = None) -> bool:
        """
        This function checks if a given column is a genotype column in the database.

        :param column: the column name to check. If a column name is provided, the
            method delegates to `Database.is_genotype_column`; if it is None, False
            is returned
        :type column: str
        :return: True if the column is a genotype column, False otherwise.
        """

        if column is not None:
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False

    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False
        if the key doesn't exist.

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        It returns the connection format of the object, raising a ValueError if the
        format is not supported.
        :return: The connexion_format.
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table, based
        on the configured database format.

        :param file: the file to load into the table, as a path on your system
        :param columns: a string containing the names of the columns in the table
            where the data will be inserted, separated by commas, e.g. '"id", "name"'
        :type columns: str
        :param header_len: the number of lines to skip at the beginning of the file
            before reading the actual data, so any header information is not
            processed as data, defaults to 0
        :type header_len: int (optional)
        :param sep: the separator character used in the file being read. The default
            separator is `\t` (tab); change this parameter for a different separator,
            defaults to \t
        :type sep: str (optional)
        :param chunksize: the number of rows to read at a time when processing the
            file in chunks, i.e. the file is read in chunks of 1,000,000 rows by
            default, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves 'chunk' as the local pandas DataFrame
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
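    # Load sketch: append a tab-separated file into the variants table
    # ("batch.tsv" and the column list are hypothetical).
    #
    #   variants.insert_file_to_table(
    #       "batch.tsv",
    #       columns='"#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"',
    #       header_len=0,
    #       sep="\t",
    #   )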
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with
        options to drop the table before loading the data and to specify a sample
        size.

        :param input_file: The path to the input file. This is the VCF file that will
            be loaded into the table
        :type input_file: str
        :param drop_variants_table: whether the variants table should be dropped
            before loading the data. If set to `True`, the variants table is dropped
            first, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: the number of rows to sample from the input file when
            inferring its structure. If set to `None`, it is replaced by -1 (use all
            rows), defaults to 20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # Change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # Get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connection format
        connexion_format = self.get_connexion_format()

        # Sample size
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connection
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                    )

            # Load from other file formats
            else:

                try:
                    # Create table or view
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except Exception:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connection
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples (note: aliases `structure`, so FORMAT is shared)
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the length of each file chunk to load
            chunksize = 100000

            # Delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFO fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()
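    # Load sketch: reload from another file ("other.vcf.gz" is hypothetical) and
    # rebuild the variants table.
    #
    #   variants.load_data(input_file="other.vcf.gz", drop_variants_table=True)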
    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos"
        parameter, defaulting to False if it is not set.
        :return: The boolean value of the "explode_infos" parameter, or False if the
            parameter is not present.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of INFO fields to
        explode, based on the input parameter `explode_infos_fields`.

        :param explode_infos_fields: the fields to be exploded, either as the keyword
            "*" (all header fields) or as a comma-separated list of field names or
            patterns
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: whether to remove fields that are not
            present in the header. If set to `True`, any field that is not in the
            header is excluded from the list of exploded fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: A list of INFO fields to explode. If the `explode_infos_fields`
            parameter is not provided, the fields are taken from the parameters;
            fields are stripped of spaces, split on commas, and patterns are expanded
            against the header.
        """

        # If no fields, get them from param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If still no fields, default to all fields in header using the keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without the * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # Format keyword * as a regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields matching the pattern
                r = re.compile(rf"^{field}$")
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid badly formatted headers)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field if it does not already exist, and if it is in the
                    # header (when asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the
        `explode_infos_prefix` parameter, or the value configured in the parameters
        if `explode_infos_prefix` is not provided.

        :param explode_infos_prefix: a prefix to be used for the exploded INFO fields
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix
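    # Pattern sketch: assuming the header defines INFO fields DP, AF and AC
    # (hypothetical), patterns are expanded against the header.
    #
    #   variants.get_explode_infos_fields("DP,A.*")  # -> ["DP", "AC", "AF"]
    #   variants.get_explode_infos_fields("*")       # -> all header INFO fields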
1668 """ 1669 1670 # added 1671 added = False 1672 dropped = False 1673 1674 # Check if the column already exists in the table 1675 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1676 columns = self.get_query_to_df(query).columns.tolist() 1677 if column_name.upper() in [c.upper() for c in columns]: 1678 log.debug( 1679 f"The {column_name} column already exists in the {table_name} table" 1680 ) 1681 if drop: 1682 self.drop_column(table_name=table_name, column_name=column_name) 1683 dropped = True 1684 else: 1685 return None 1686 else: 1687 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1688 1689 # Add column in table 1690 add_column_query = ( 1691 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1692 ) 1693 if default_value is not None: 1694 add_column_query += f" DEFAULT {default_value}" 1695 self.execute_query(add_column_query) 1696 added = not dropped 1697 log.debug( 1698 f"The {column_name} column was successfully added to the {table_name} table" 1699 ) 1700 1701 if added: 1702 added_column = { 1703 "table_name": table_name, 1704 "column_name": column_name, 1705 "column_type": column_type, 1706 "default_value": default_value, 1707 } 1708 else: 1709 added_column = None 1710 1711 return added_column 1712 1713 def drop_column( 1714 self, column: dict = None, table_name: str = None, column_name: str = None 1715 ) -> bool: 1716 """ 1717 The `drop_column` function drops a specified column from a given table in a database and returns 1718 True if the column was successfully dropped, and False if the column does not exist in the 1719 table. 1720 1721 :param column: The `column` parameter is a dictionary that contains information about the column 1722 you want to drop. It has two keys: 1723 :type column: dict 1724 :param table_name: The `table_name` parameter is the name of the table from which you want to 1725 drop a column 1726 :type table_name: str 1727 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1728 from the table 1729 :type column_name: str 1730 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1731 and False if the column does not exist in the table. 
1732 """ 1733 1734 # Find column infos 1735 if column: 1736 if isinstance(column, dict): 1737 table_name = column.get("table_name", None) 1738 column_name = column.get("column_name", None) 1739 elif isinstance(column, str): 1740 table_name = self.get_table_variants() 1741 column_name = column 1742 else: 1743 table_name = None 1744 column_name = None 1745 1746 if not table_name and not column_name: 1747 return False 1748 1749 # Removed 1750 removed = False 1751 1752 # Check if the column already exists in the table 1753 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1754 columns = self.get_query_to_df(query).columns.tolist() 1755 if column_name in columns: 1756 log.debug(f"The {column_name} column exists in the {table_name} table") 1757 else: 1758 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1759 return False 1760 1761 # Add column in table # ALTER TABLE integers DROP k 1762 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1763 self.execute_query(add_column_query) 1764 removed = True 1765 log.debug( 1766 f"The {column_name} column was successfully dropped to the {table_name} table" 1767 ) 1768 1769 return removed 1770 1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. 
1813         """
1814
1815         # drop indexes
1816         self.drop_indexes()
1817
1818         # connexion format
1819         connexion_format = self.get_connexion_format()
1820
1821         # Access
1822         access = self.get_config().get("access", None)
1823
1824         # Added columns
1825         added_columns = []
1826
1827         if access not in ["RO"]:
1828
1829             # prefix
1830             if prefix in [None, True] or not isinstance(prefix, str):
1831                 if self.get_explode_infos_prefix() not in [None, True]:
1832                     prefix = self.get_explode_infos_prefix()
1833                 else:
1834                     prefix = "INFO/"
1835
1836             # table variants
1837             if table is not None:
1838                 table_variants = table
1839             else:
1840                 table_variants = self.get_table_variants(clause="select")
1841
1842             # extra infos
1843             try:
1844                 extra_infos = self.get_extra_infos()
1845             except Exception:
1846                 extra_infos = []
1847
1848             # Header infos
1849             header_infos = self.get_header().infos
1850
1851             log.debug(
1852                 f"Explode INFO fields - ADD [{len(header_infos)}] annotation fields"
1853             )
1854
1855             sql_info_alter_table_array = []
1856
1857             # Info fields to check
1858             fields_list = list(header_infos)
1859             if fields:
1860                 fields_list += fields
1861             fields_list = set(fields_list)
1862
1863             # If no fields
1864             if not fields:
1865                 fields = []
1866
1867             # Translate fields if patterns
1868             fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1869
1870             for info in fields:
1871
1872                 info_id_sql = prefix + info
1873
1874                 if (
1875                     info in fields_list
1876                     or prefix + info in fields_list
1877                     or info in extra_infos
1878                 ):
1879
1880                     log.debug(f"Explode INFO fields - ADD '{info}' annotation field")
1881
1882                     if info in header_infos:
1883                         info_type = header_infos[info].type
1884                         info_num = header_infos[info].num
1885                     else:
1886                         info_type = "String"
1887                         info_num = 0
1888
1889                     type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1890                     if info_num != 1:
1891                         type_sql = "VARCHAR"
1892
1893                     # Add field
1894                     added_column = self.add_column(
1895                         table_name=table_variants,
1896                         column_name=info_id_sql,
1897                         column_type=type_sql,
1898                         default_value="null",
1899                         drop=force,
1900                     )
1901
1902                     if added_column:
1903                         added_columns.append(added_column)
1904
1905                     if added_column or force:
1906
1907                         # add field to index
1908                         self.index_additionnal_fields.append(info_id_sql)
1909
1910                         # Update field array
1911                         if connexion_format in ["duckdb"]:
1912                             update_info_field = f"""
1913                             "{info_id_sql}" =
1914                             CASE
1915                                 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1916                                 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1917                             END
1918                             """
1919                         elif connexion_format in ["sqlite"]:
1920                             update_info_field = f"""
1921                             "{info_id_sql}" =
1922                             CASE
1923                                 WHEN instr(INFO, '{info}=') = 0 THEN NULL
1924                                 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1925                                 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1926                             END
1927                             """
1928
1929                         sql_info_alter_table_array.append(update_info_field)
1930
1931             if sql_info_alter_table_array:
1932
1933                 # By chromosomes
1934                 try:
1935                     chromosomes_list = list(
1936                         self.get_query_to_df(
1937                             f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1938                         )["#CHROM"]
1939                     )
1940                 except Exception:
1941                     chromosomes_list = [None]
1942
1943                 for chrom in chromosomes_list:
1944                     log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1945
1946                     # Where clause
1947                     where_clause = ""
1948                     if chrom and len(chromosomes_list) > 1:
1949                         where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1950
1951                     # Update table
1952                     if proccess_all_fields_together:
1953                         sql_info_alter_table_array_join = ", ".join(
1954                             sql_info_alter_table_array
1955                         )
1956                         if sql_info_alter_table_array_join:
1957                             sql_info_alter_table = f"""
1958                             UPDATE {table_variants}
1959                             SET {sql_info_alter_table_array_join}
1960                             {where_clause}
1961                             """
1962                             log.debug(
1963                                 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1964                             )
1965                             # log.debug(sql_info_alter_table)
1966                             self.conn.execute(sql_info_alter_table)
1967                     else:
1968                         sql_info_alter_num = 0
1969                         for sql_info_alter in sql_info_alter_table_array:
1970                             sql_info_alter_num += 1
1971                             sql_info_alter_table = f"""
1972                             UPDATE {table_variants}
1973                             SET {sql_info_alter}
1974                             {where_clause}
1975                             """
1976                             log.debug(
1977                                 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1978                             )
1979                             # log.debug(sql_info_alter_table)
1980                             self.conn.execute(sql_info_alter_table)
1981
1982         # create indexes
1983         if create_index:
1984             self.create_indexes()
1985
1986         return added_columns
1987
1988     def create_indexes(self) -> None:
1989         """
1990         Create indexes on the table after insertion
1991         """
1992
1993         # Access
1994         access = self.get_config().get("access", None)
1995
1996         # get table variants
1997         table_variants = self.get_table_variants("FROM")
1998
1999         if self.get_indexing() and access not in ["RO"]:
2000             # Create index
2001             sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
2002             self.conn.execute(sql_create_table_index)
2003             sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
2004             self.conn.execute(sql_create_table_index)
2005             sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
2006             self.conn.execute(sql_create_table_index)
2007             sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ("REF")'
2008             self.conn.execute(sql_create_table_index)
2009             sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
2010             self.conn.execute(sql_create_table_index)
2011             for field in self.index_additionnal_fields:
2012                 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
2013                 self.conn.execute(sql_create_table_index)
2014
2015     def drop_indexes(self) -> None:
2016         """
2017         Drop all indexes on the variants table (e.g. before bulk inserts or updates)
2018         """
2019
2020         # Access
2021         access = self.get_config().get("access", None)
2022
2023         # get table variants
2024         table_variants = self.get_table_variants("FROM")
2025
2026         # Get database format
2027         connexion_format = self.get_connexion_format()
2028
2029         if access not in ["RO"]:
2030             if connexion_format in ["duckdb"]:
2031                 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2032             elif connexion_format in ["sqlite"]:
2033                 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2034             else: return  # unknown connexion format: no index catalog to query
2035             list_indexes = self.conn.execute(sql_list_indexes)
2036             index_names = [row[0] for row in list_indexes.fetchall()]
2037             for index in index_names:
2038                 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2039                 self.conn.execute(sql_drop_table_index)
2040
2041     def read_vcf_header(self, f) -> list:
2042         """
2043         It reads the header of a VCF file and returns a list of the header lines
2044
2045         :param f: the file object
2046         :return: The header lines of the VCF file.
2047         """
2048
2049         header_list = []
2050         for line in f:
2051             header_list.append(line)
2052             if line.startswith("#CHROM"):
2053                 break
2054         return header_list
2055
2056     def read_vcf_header_file(self, file: str = None) -> list:
2057         """
2058         The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2059         uncompressed files.
2060
2061         :param file: The `file` parameter is a string that represents the path to the VCF header file
2062         that you want to read. It is an optional parameter, so if you don't provide a value, it will
2063         default to `None`
2064         :type file: str
2065         :return: The function `read_vcf_header_file` returns a list.
2066         """
2067
2068         if self.get_input_compressed(input_file=file):
2069             with bgzf.open(file, "rt") as f:
2070                 return self.read_vcf_header(f=f)
2071         else:
2072             with open(file, "rt") as f:
2073                 return self.read_vcf_header(f=f)
2074
2075     def execute_query(self, query: str):
2076         """
2077         It takes a query as an argument, executes it, and returns the results
2078
2079         :param query: The query to be executed
2080         :return: The result of the query is being returned.
2081         """
2082         if query:
2083             return self.conn.execute(query)  # .fetchall()
2084         else:
2085             return None
2086
2087     def export_output(
2088         self,
2089         output_file: str | None = None,
2090         output_header: str | None = None,
2091         export_header: bool = True,
2092         query: str | None = None,
2093         parquet_partitions: list | None = None,
2094         chunk_size: int | None = None,
2095         threads: int | None = None,
2096         sort: bool = False,
2097         index: bool = False,
2098         order_by: str | None = None,
2099         fields_to_rename: dict | None = None,
2100     ) -> bool:
2101         """
2102         The `export_output` function exports data from a VCF file to various formats, including VCF,
2103         CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
2104         partitioning.
2105
2106         :param output_file: The `output_file` parameter is a string that specifies the name of the
2107         output file where the exported data will be saved
2108         :type output_file: str | None
2109         :param output_header: The `output_header` parameter is a string that specifies the name of the
2110         file where the header of the VCF file will be exported. If this parameter is not provided, the
2111         header will be exported to a file with the same name as the `output_file` parameter, but with
2112         the extension ".hdr"
2113         :type output_header: str | None
2114         :param export_header: The `export_header` parameter is a boolean flag that determines whether
2115         the header of a VCF file should be exported to a separate file or not. If `export_header` is
2116         True, the header will be exported to a file. If `export_header` is False, the header will not
If `export_header` is False, the header will not 2117 be, defaults to True 2118 :type export_header: bool (optional) 2119 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2120 that can be used to filter and select specific data from the VCF file before exporting it. If 2121 provided, only the data that matches the query will be exported. This allows you to customize 2122 the exported data based on 2123 :type query: str | None 2124 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2125 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2126 organize data in a hierarchical directory structure based on the values of one or more columns. 2127 This can improve query performance when working with large datasets 2128 :type parquet_partitions: list | None 2129 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2130 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2131 multiple files. It helps in optimizing the export process by breaking down the data into 2132 manageable chunks for processing and storage 2133 :type chunk_size: int | None 2134 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2135 threads to be used during the export process. It determines the level of parallelism and can 2136 improve the performance of the export operation. If this parameter is not provided, the function 2137 will use the default number of threads 2138 :type threads: int | None 2139 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2140 determines whether the output file should be sorted based on genomic coordinates of the 2141 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2142 `False`,, defaults to False 2143 :type sort: bool (optional) 2144 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2145 determines whether an index should be created on the output file. If `index` is set to `True`, 2146 an index will be created on the output file. If `index` is set to `False`, no, defaults to False 2147 :type index: bool (optional) 2148 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2149 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2150 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2151 output file should be 2152 :type order_by: str | None 2153 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2154 mapping of field names to be renamed during the export process. This parameter allows you to 2155 customize the output field names before exporting the data. Each key-value pair in the 2156 dictionary represents the original field name as the key and the new field name 2157 :type fields_to_rename: dict | None 2158 :return: The `export_output` function returns a boolean value. It checks if the output file 2159 exists and returns True if it does, or None if it doesn't. 
2160 """ 2161 2162 # Log 2163 log.info("Exporting...") 2164 2165 # Full path 2166 output_file = full_path(output_file) 2167 output_header = full_path(output_header) 2168 2169 # Config 2170 config = self.get_config() 2171 2172 # Param 2173 param = self.get_param() 2174 2175 # Tmp files to remove 2176 tmp_to_remove = [] 2177 2178 # If no output, get it 2179 if not output_file: 2180 output_file = self.get_output() 2181 2182 # If not threads 2183 if not threads: 2184 threads = self.get_threads() 2185 2186 # Rename fields 2187 if not fields_to_rename: 2188 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2189 self.rename_info_fields(fields_to_rename=fields_to_rename) 2190 2191 # Auto header name with extension 2192 if export_header or output_header: 2193 if not output_header: 2194 output_header = f"{output_file}.hdr" 2195 # Export header 2196 self.export_header(output_file=output_file) 2197 2198 # Switch off export header if VCF output 2199 output_file_type = get_file_format(output_file) 2200 if output_file_type in ["vcf"]: 2201 export_header = False 2202 tmp_to_remove.append(output_header) 2203 2204 # Chunk size 2205 if not chunk_size: 2206 chunk_size = config.get("chunk_size", None) 2207 2208 # Parquet partition 2209 if not parquet_partitions: 2210 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2211 if parquet_partitions and isinstance(parquet_partitions, str): 2212 parquet_partitions = parquet_partitions.split(",") 2213 2214 # Order by 2215 if not order_by: 2216 order_by = param.get("export", {}).get("order_by", "") 2217 2218 # Header in output 2219 header_in_output = param.get("export", {}).get("include_header", False) 2220 2221 # Database 2222 database_source = self.get_connexion() 2223 2224 # Connexion format 2225 connexion_format = self.get_connexion_format() 2226 2227 # Explode infos 2228 if self.get_explode_infos(): 2229 self.explode_infos( 2230 prefix=self.get_explode_infos_prefix(), 2231 fields=self.get_explode_infos_fields(), 2232 force=False, 2233 ) 2234 2235 # if connexion_format in ["sqlite"] or query: 2236 if connexion_format in ["sqlite"]: 2237 2238 # Export in Parquet 2239 random_tmp = "".join( 2240 random.choice(string.ascii_lowercase) for i in range(10) 2241 ) 2242 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2243 tmp_to_remove.append(database_source) 2244 2245 # Table Variants 2246 table_variants = self.get_table_variants() 2247 2248 # Create export query 2249 sql_query_export_subquery = f""" 2250 SELECT * FROM {table_variants} 2251 """ 2252 2253 # Write source file 2254 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2255 2256 # Create database 2257 database = Database( 2258 database=database_source, 2259 table="variants", 2260 header_file=output_header, 2261 conn_config=self.get_connexion_config(), 2262 ) 2263 2264 # Existing colomns header 2265 existing_columns_header = database.get_header_columns_from_database(query=query) 2266 2267 # Sample list 2268 if output_file_type in ["vcf"]: 2269 get_samples = self.get_samples() 2270 get_samples_check = self.get_samples_check() 2271 samples_force = get_samples is not None 2272 sample_list = self.get_header_sample_list( 2273 check=get_samples_check, 2274 samples=get_samples, 2275 samples_force=samples_force, 2276 ) 2277 else: 2278 sample_list = None 2279 2280 # Export file 2281 database.export( 2282 output_database=output_file, 2283 output_header=output_header, 2284 existing_columns_header=existing_columns_header, 2285 
2256         # Create database
2257         database = Database(
2258             database=database_source,
2259             table="variants",
2260             header_file=output_header,
2261             conn_config=self.get_connexion_config(),
2262         )
2263
2264         # Existing columns header
2265         existing_columns_header = database.get_header_columns_from_database(query=query)
2266
2267         # Sample list
2268         if output_file_type in ["vcf"]:
2269             get_samples = self.get_samples()
2270             get_samples_check = self.get_samples_check()
2271             samples_force = get_samples is not None
2272             sample_list = self.get_header_sample_list(
2273                 check=get_samples_check,
2274                 samples=get_samples,
2275                 samples_force=samples_force,
2276             )
2277         else:
2278             sample_list = None
2279
2280         # Export file
2281         database.export(
2282             output_database=output_file,
2283             output_header=output_header,
2284             existing_columns_header=existing_columns_header,
2285             parquet_partitions=parquet_partitions,
2286             chunk_size=chunk_size,
2287             threads=threads,
2288             sort=sort,
2289             index=index,
2290             header_in_output=header_in_output,
2291             order_by=order_by,
2292             query=query,
2293             export_header=export_header,
2294             sample_list=sample_list,
2295         )
2296
2297         # Remove temporary files
2298         remove_if_exists(tmp_to_remove)
2299
2300         return os.path.exists(output_file) or None
2303
2304     def get_extra_infos(self, table: str = None) -> list:
2305         """
2306         The `get_extra_infos` function returns a list of columns that are in a specified table but not
2307         in the header.
2308
2309         :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2310         name of the table from which you want to retrieve the extra columns that are not present in the
2311         header. If the `table` parameter is not provided when calling the function, it will default to
2312         using the variants table
2313         :type table: str
2314         :return: A list of columns that are in the specified table but not in the header of the table.
2315         """
2316
2317         header_columns = []
2318
2319         if not table:
2320             table = self.get_table_variants(clause="from")
2321             header_columns = self.get_header_columns()
2322
2323         # Check all columns in the database
2324         query = f""" SELECT * FROM {table} LIMIT 1 """
2325         log.debug(f"query {query}")
2326         table_columns = self.get_query_to_df(query).columns.tolist()
2327         extra_columns = []
2328
2329         # Construct extra infos (not in header)
2330         for column in table_columns:
2331             if column not in header_columns:
2332                 extra_columns.append(column)
2333
2334         return extra_columns
2335
2336     def get_extra_infos_sql(self, table: str = None) -> str:
2337         """
2338         It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2339         by double quotes
2340
2341         :param table: The name of the table to get the extra infos from. If None, the default table is
2342         used
2343         :type table: str
2344         :return: A string of the extra infos
2345         """
2346
2347         return ", ".join(
2348             ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2349         )
2350
2351     def export_header(
2352         self,
2353         header_name: str = None,
2354         output_file: str = None,
2355         output_file_ext: str = ".hdr",
2356         clean_header: bool = True,
2357         remove_chrom_line: bool = False,
2358     ) -> str:
2359         """
2360         The `export_header` function takes a VCF file, extracts the header, modifies it according to
2361         specified options, and writes it to a new file.
2362
2363         :param header_name: The `header_name` parameter is the name of the header file to be created. If
2364         this parameter is not specified, the header will be written to the output file
2365         :type header_name: str
2366         :param output_file: The `output_file` parameter in the `export_header` function is used to
2367         specify the name of the output file where the header will be written. If this parameter is not
2368         provided, the header will be written to a temporary file
2369         :type output_file: str
2370         :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2371         string that represents the extension of the output header file. This extension is appended to
2372         the `output_file` name to create the final header file name. Defaults to ".hdr"
2373         :type output_file_ext: str (optional)
2374         :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2375         flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2376         `True`, the function will clean the header by modifying certain lines based on a specific
2377         pattern; if set to `False`, the header is written unchanged. Defaults to True
2378         :type clean_header: bool (optional)
2379         :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2380         boolean flag that determines whether the #CHROM line should be removed from the header before
2381         writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to
2382         `False`, it will be kept. Defaults to False
2383         :type remove_chrom_line: bool (optional)
2385         :return: The function `export_header` returns the name of the temporary header file that is
2386         created.
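
        Example (illustrative sketch; assumes a loaded `Variants` object named
        `variants` with output file "variants.tsv"):

            >>> variants.export_header(output_file="variants.tsv")
            'variants.tsv.hdr'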
2387         """
2388
2389         if not header_name and not output_file:
2390             output_file = self.get_output()
2391
2392         if self.get_header():
2393
2394             # Get header object
2395             header_obj = self.get_header()
2396
2397             # Create database
2398             db_for_header = Database(database=self.get_input())
2399
2400             # Get real columns in the file
2401             db_header_columns = db_for_header.get_columns()
2402
2403             with tempfile.TemporaryDirectory() as tmpdir:
2404
2405                 # Write header file
2406                 header_file_tmp = os.path.join(tmpdir, "header")
2407                 with open(header_file_tmp, "w") as f:
2408                     vcf.Writer(f, header_obj)
2409
2411                 # Replace #CHROM line with real columns
2412                 header_list = db_for_header.read_header_file(
2413                     header_file=header_file_tmp
2414                 )
2415                 header_list[-1] = "\t".join(db_header_columns)
2416
2417                 # Remove #CHROM line
2418                 if remove_chrom_line:
2419                     header_list.pop()
2420
2421                 # Clean header
2422                 if clean_header:
2423                     header_list_clean = []
2424                     for head in header_list:
2425                         # Clean head for malformed header
2426                         head_clean = head
2427                         head_clean = re.subn(
2428                             "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2429                             r"##FORMAT=<ID=\1,Number=\2,Type=String",
2430                             head_clean,
2431                             2,
2432                         )[0]
2433                         # Keep cleaned header line
2434                         header_list_clean.append(head_clean)
2435                     header_list = header_list_clean
2436
2437                 tmp_header_name = output_file + output_file_ext
2438
2439                 with open(tmp_header_name, "w") as f:
2440                     for line in header_list:
2441                         f.write(line)
2443
2444             return tmp_header_name
2445
2446     def export_variant_vcf(
2447         self,
2448         vcf_file,
2449         remove_info: bool = False,
2450         add_samples: bool = True,
2451         list_samples: list = [],
2452         where_clause: str = "",
2453         index: bool = False,
2454         threads: int | None = None,
2455     ) -> bool | None:
2456         """
2457         The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2458         remove INFO field, add samples, and control compression and indexing.
2459
2460         :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2461         written to. It is the output file that will contain the filtered VCF data based on the specified
2462         parameters
2463         :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2464         boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2465         to `True`, the INFO field will be removed; if set to `False`, the INFO field will be included
2466         in the output. Defaults to False
2467         :type remove_info: bool (optional)
2468         :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2469         the samples should be added to the VCF file or not. If set to True, the samples will be added.
2470         If set to False, the samples will be removed. Defaults to True
2471         :type add_samples: bool (optional)
2472         :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2473         in the output VCF file. By default, all samples will be included. If you provide a list of
2474         samples, only those samples will be included in the output file
2475         :type list_samples: list
2476         :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2477         determines whether or not to create an index for the output VCF file. If `index` is set to
2478         `True`, the output VCF file will be indexed using tabix; if set to `False`, no index is created. Defaults to False
2479         :type index: bool (optional)
2480         :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2481         number of threads to use for exporting the VCF file. It determines how many parallel threads
2482         will be used during the export process. More threads can potentially speed up the export process
2483         by utilizing multiple cores of the processor. If not provided, the default number of threads is used
2484         :type threads: int | None
2485         :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2486         method with various parameters including the output file, query, threads, sort flag, and index
2487         flag. The `export_output` method is responsible for exporting the VCF data based on the
2488         specified parameters and configurations provided in the `export_variant_vcf` function.
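
        Example (illustrative sketch; assumes a loaded `Variants` object named
        `variants`; `where_clause` is an optional SQL WHERE clause appended to
        the variants selection):

            >>> variants.export_variant_vcf(
            ...     vcf_file="subset.vcf.gz",
            ...     remove_info=True,
            ...     add_samples=False,
            ...     where_clause=""" WHERE "#CHROM" = 'chr1' """,
            ...     index=True,
            ... )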
2489         """
2490
2491         # Config
2492         config = self.get_config()
2493
2494         # Extract VCF
2495         log.debug("Export VCF...")
2496
2497         # Table variants
2498         table_variants = self.get_table_variants()
2499
2500         # Threads
2501         if not threads:
2502             threads = self.get_threads()
2503
2504         # Info fields
2505         if remove_info:
2506             if not isinstance(remove_info, str):
2507                 remove_info = "."
2508 info_field = f"""'{remove_info}' as INFO""" 2509 else: 2510 info_field = "INFO" 2511 2512 # Samples fields 2513 if add_samples: 2514 if not list_samples: 2515 list_samples = self.get_header_sample_list() 2516 if list_samples: 2517 samples_fields = " , FORMAT , " + " , ".join( 2518 [f""" "{sample}" """ for sample in list_samples] 2519 ) 2520 else: 2521 samples_fields = "" 2522 log.debug(f"samples_fields: {samples_fields}") 2523 else: 2524 samples_fields = "" 2525 2526 # Where clause 2527 if where_clause is None: 2528 where_clause = "" 2529 2530 # Variants 2531 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2532 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2533 log.debug(f"sql_query_select={sql_query_select}") 2534 2535 return self.export_output( 2536 output_file=vcf_file, 2537 output_header=None, 2538 export_header=True, 2539 query=sql_query_select, 2540 parquet_partitions=None, 2541 chunk_size=config.get("chunk_size", None), 2542 threads=threads, 2543 sort=True, 2544 index=index, 2545 order_by=None, 2546 ) 2547 2548 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2549 """ 2550 It takes a list of commands and runs them in parallel using the number of threads specified 2551 2552 :param commands: A list of commands to run 2553 :param threads: The number of threads to use, defaults to 1 (optional) 2554 """ 2555 2556 run_parallel_commands(commands, threads) 2557 2558 def get_threads(self, default: int = 1) -> int: 2559 """ 2560 This function returns the number of threads to use for a job, with a default value of 1 if not 2561 specified. 2562 2563 :param default: The `default` parameter in the `get_threads` method is used to specify the 2564 default number of threads to use if no specific value is provided. If no value is provided for 2565 the `threads` parameter in the configuration or input parameters, the `default` value will be 2566 used, defaults to 1 2567 :type default: int (optional) 2568 :return: the number of threads to use for the current job. 2569 """ 2570 2571 # Config 2572 config = self.get_config() 2573 2574 # Param 2575 param = self.get_param() 2576 2577 # Input threads 2578 input_thread = param.get("threads", config.get("threads", None)) 2579 2580 # Check threads 2581 if not input_thread: 2582 threads = default 2583 elif int(input_thread) <= 0: 2584 threads = os.cpu_count() 2585 else: 2586 threads = int(input_thread) 2587 return threads 2588 2589 def get_memory(self, default: str = None) -> str: 2590 """ 2591 This function retrieves the memory value from parameters or configuration with a default value 2592 if not found. 2593 2594 :param default: The `get_memory` function takes in a default value as a string parameter. This 2595 default value is used as a fallback in case the `memory` parameter is not provided in the 2596 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2597 the function 2598 :type default: str 2599 :return: The `get_memory` function returns a string value representing the memory parameter. If 2600 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2601 return the default value provided as an argument to the function. 
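
        Example (illustrative sketch; assumes a loaded `Variants` object named
        `variants` with no "memory" entry in its param or config dictionaries):

            >>> variants.get_memory(default="8G")
            '8G'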
2602 """ 2603 2604 # Config 2605 config = self.get_config() 2606 2607 # Param 2608 param = self.get_param() 2609 2610 # Input threads 2611 input_memory = param.get("memory", config.get("memory", None)) 2612 2613 # Check threads 2614 if input_memory: 2615 memory = input_memory 2616 else: 2617 memory = default 2618 2619 return memory 2620 2621 def update_from_vcf(self, vcf_file: str) -> None: 2622 """ 2623 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2624 2625 :param vcf_file: the path to the VCF file 2626 """ 2627 2628 connexion_format = self.get_connexion_format() 2629 2630 if connexion_format in ["duckdb"]: 2631 self.update_from_vcf_duckdb(vcf_file) 2632 elif connexion_format in ["sqlite"]: 2633 self.update_from_vcf_sqlite(vcf_file) 2634 2635 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2636 """ 2637 It takes a VCF file and updates the INFO column of the variants table in the database with the 2638 INFO column of the VCF file 2639 2640 :param vcf_file: the path to the VCF file 2641 """ 2642 2643 # varaints table 2644 table_variants = self.get_table_variants() 2645 2646 # Loading VCF into temporaire table 2647 skip = self.get_header_length(file=vcf_file) 2648 vcf_df = pd.read_csv( 2649 vcf_file, 2650 sep="\t", 2651 engine="c", 2652 skiprows=skip, 2653 header=0, 2654 low_memory=False, 2655 ) 2656 sql_query_update = f""" 2657 UPDATE {table_variants} as table_variants 2658 SET INFO = concat( 2659 CASE 2660 WHEN INFO NOT IN ('', '.') 2661 THEN INFO 2662 ELSE '' 2663 END, 2664 ( 2665 SELECT 2666 concat( 2667 CASE 2668 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2669 THEN ';' 2670 ELSE '' 2671 END 2672 , 2673 CASE 2674 WHEN table_parquet.INFO NOT IN ('','.') 2675 THEN table_parquet.INFO 2676 ELSE '' 2677 END 2678 ) 2679 FROM vcf_df as table_parquet 2680 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2681 AND table_parquet.\"POS\" = table_variants.\"POS\" 2682 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2683 AND table_parquet.\"REF\" = table_variants.\"REF\" 2684 AND table_parquet.INFO NOT IN ('','.') 2685 ) 2686 ) 2687 ; 2688 """ 2689 self.conn.execute(sql_query_update) 2690 2691 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2692 """ 2693 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2694 table, then updates the INFO column of the variants table with the INFO column of the temporary 2695 table 2696 2697 :param vcf_file: The path to the VCF file you want to update the database with 2698 """ 2699 2700 # Create a temporary table for the VCF 2701 table_vcf = "tmp_vcf" 2702 sql_create = ( 2703 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2704 ) 2705 self.conn.execute(sql_create) 2706 2707 # Loading VCF into temporaire table 2708 vcf_df = pd.read_csv( 2709 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2710 ) 2711 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2712 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2713 2714 # Update table 'variants' with VCF data 2715 # warning: CONCAT as || operator 2716 sql_query_update = f""" 2717 UPDATE variants as table_variants 2718 SET INFO = CASE 2719 WHEN INFO NOT IN ('', '.') 2720 THEN INFO 2721 ELSE '' 2722 END || 2723 ( 2724 SELECT 2725 CASE 2726 WHEN table_variants.INFO NOT IN ('','.') 2727 AND table_vcf.INFO NOT IN ('','.') 2728 THEN ';' 2729 ELSE '' 2730 END || 
2691     def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2692         """
2693         It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2694         table, then updates the INFO column of the variants table with the INFO column of the temporary
2695         table
2696
2697         :param vcf_file: The path to the VCF file you want to update the database with
2698         """
2699
2700         # Create a temporary table for the VCF
2701         table_vcf = "tmp_vcf"
2702         sql_create = (
2703             f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2704         )
2705         self.conn.execute(sql_create)
2706
2707         # Loading VCF into temporary table
2708         vcf_df = pd.read_csv(
2709             vcf_file, sep="\t", comment="#", header=None, low_memory=False
2710         )
2711         vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2712         vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2713
2714         # Update table 'variants' with VCF data
2715         # warning: CONCAT as || operator
2716         sql_query_update = f"""
2717         UPDATE variants as table_variants
2718         SET INFO = CASE
2719                     WHEN INFO NOT IN ('', '.')
2720                     THEN INFO
2721                     ELSE ''
2722                 END ||
2723             (
2724             SELECT
2725                 CASE
2726                     WHEN table_variants.INFO NOT IN ('','.')
2727                     AND table_vcf.INFO NOT IN ('','.')
2728                     THEN ';'
2729                     ELSE ''
2730                 END ||
2731                 CASE
2732                     WHEN table_vcf.INFO NOT IN ('','.')
2733                     THEN table_vcf.INFO
2734                     ELSE ''
2735                 END
2736             FROM {table_vcf} as table_vcf
2737             WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2738             AND table_vcf.\"POS\" = table_variants.\"POS\"
2739             AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2740             AND table_vcf.\"REF\" = table_variants.\"REF\"
2741             )
2742         """
2743         self.conn.execute(sql_query_update)
2744
2745         # Drop temporary table
2746         sql_drop = f"DROP TABLE {table_vcf}"
2747         self.conn.execute(sql_drop)
2748
2749     def drop_variants_table(self) -> None:
2750         """
2751         This function drops the variants table
2752         """
2753
2754         table_variants = self.get_table_variants()
2755         sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2756         self.conn.execute(sql_table_variants)
2757
2758     def set_variant_id(
2759         self, variant_id_column: str = "variant_id", force: bool = None
2760     ) -> str:
2761         """
2762         It adds a column to the variants table called `variant_id` and populates it with a hash of the
2763         assembly, the `#CHROM`, `POS`, `REF`, and `ALT` columns, and the `SVTYPE` INFO field
2764
2765         :param variant_id_column: The name of the column to be created in the variants table, defaults
2766         to variant_id
2767         :type variant_id_column: str (optional)
2768         :param force: If True, the variant_id column will be created even if it already exists
2769         :type force: bool
2770         :return: The name of the column that contains the variant_id
2771         """
2772
2773         # Assembly
2774         assembly = self.get_param().get(
2775             "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2776         )
2777
2778         # INFO/Tag prefix
2779         prefix = self.get_explode_infos_prefix()
2780
2781         # Explode INFO/SVTYPE
2782         added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2783
2784         # variants table
2785         table_variants = self.get_table_variants()
2786
2787         # variant_id column
2788         if not variant_id_column:
2789             variant_id_column = "variant_id"
2790
2791         # Create variant_id column
2792         if "variant_id" not in self.get_extra_infos() or force:
2793
2794             # Create column
2795             self.add_column(
2796                 table_name=table_variants,
2797                 column_name=variant_id_column,
2798                 column_type="UBIGINT",
2799                 default_value="0",
2800             )
2801
2802             # Update column
2803             self.conn.execute(
2804                 f"""
2805                 UPDATE {table_variants}
2806                 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", "{prefix}SVTYPE")
2807                 """
2808             )
2809
2810         # Remove added columns
2811         for added_column in added_columns:
2812             self.drop_column(column=added_column)
2813
2814         # return variant_id column name
2815         return variant_id_column
2816
2817     def get_variant_id_column(
2818         self, variant_id_column: str = "variant_id", force: bool = None
2819     ) -> str:
2820         """
2821         This function returns the variant_id column name
2822
2823         :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2824         defaults to variant_id
2825         :type variant_id_column: str (optional)
2826         :param force: If True, the variant_id column is recomputed even if it already exists. If
2827         False or None, it is only computed when the column is missing
2829         :type force: bool
2830         :return: The variant_id column name.
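
        Example (illustrative sketch; assumes a loaded `Variants` object named
        `variants`):

            >>> variants.get_variant_id_column()
            'variant_id'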
2831 """ 2832 2833 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2834 2835 ### 2836 # Annotation 2837 ### 2838 2839 def scan_databases( 2840 self, 2841 database_formats: list = ["parquet"], 2842 database_releases: list = ["current"], 2843 ) -> dict: 2844 """ 2845 The function `scan_databases` scans for available databases based on specified formats and 2846 releases. 2847 2848 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2849 of the databases to be scanned. In this case, the accepted format is "parquet" 2850 :type database_formats: list ["parquet"] 2851 :param database_releases: The `database_releases` parameter is a list that specifies the 2852 releases of the databases to be scanned. In the provided function, the default value for 2853 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2854 databases that are in the "current" 2855 :type database_releases: list 2856 :return: The function `scan_databases` returns a dictionary containing information about 2857 databases that match the specified formats and releases. 2858 """ 2859 2860 # Config 2861 config = self.get_config() 2862 2863 # Param 2864 param = self.get_param() 2865 2866 # Param - Assembly 2867 assembly = param.get("assembly", config.get("assembly", None)) 2868 if not assembly: 2869 assembly = DEFAULT_ASSEMBLY 2870 log.warning(f"Default assembly '{assembly}'") 2871 2872 # Scan for availabled databases 2873 log.info( 2874 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2875 ) 2876 databases_infos_dict = databases_infos( 2877 database_folder_releases=database_releases, 2878 database_formats=database_formats, 2879 assembly=assembly, 2880 config=config, 2881 ) 2882 log.info( 2883 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2884 ) 2885 2886 return databases_infos_dict 2887 2888 def annotation(self) -> None: 2889 """ 2890 It annotates the VCF file with the annotations specified in the config file. 
2891 """ 2892 2893 # Config 2894 config = self.get_config() 2895 2896 # Param 2897 param = self.get_param() 2898 2899 # Param - Assembly 2900 assembly = param.get("assembly", config.get("assembly", None)) 2901 if not assembly: 2902 assembly = DEFAULT_ASSEMBLY 2903 log.warning(f"Default assembly '{assembly}'") 2904 2905 # annotations databases folders 2906 annotations_databases = set( 2907 config.get("folders", {}) 2908 .get("databases", {}) 2909 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2910 + config.get("folders", {}) 2911 .get("databases", {}) 2912 .get("parquet", ["~/howard/databases/parquet/current"]) 2913 + config.get("folders", {}) 2914 .get("databases", {}) 2915 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2916 ) 2917 2918 # Get param annotations 2919 if param.get("annotations", None) and isinstance( 2920 param.get("annotations", None), str 2921 ): 2922 log.debug(param.get("annotations", None)) 2923 param_annotation_list = param.get("annotations").split(",") 2924 else: 2925 param_annotation_list = [] 2926 2927 # Each tools param 2928 if param.get("annotation_parquet", None) != None: 2929 log.debug( 2930 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2931 ) 2932 if isinstance(param.get("annotation_parquet", None), list): 2933 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2934 else: 2935 param_annotation_list.append(param.get("annotation_parquet")) 2936 if param.get("annotation_snpsift", None) != None: 2937 if isinstance(param.get("annotation_snpsift", None), list): 2938 param_annotation_list.append( 2939 "snpsift:" 2940 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2941 ) 2942 else: 2943 param_annotation_list.append( 2944 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2945 ) 2946 if param.get("annotation_snpeff", None) != None: 2947 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2948 if param.get("annotation_bcftools", None) != None: 2949 if isinstance(param.get("annotation_bcftools", None), list): 2950 param_annotation_list.append( 2951 "bcftools:" 2952 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2953 ) 2954 else: 2955 param_annotation_list.append( 2956 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2957 ) 2958 if param.get("annotation_annovar", None) != None: 2959 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2960 if param.get("annotation_exomiser", None) != None: 2961 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2962 if param.get("annotation_splice", None) != None: 2963 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2964 2965 # Merge param annotations list 2966 param["annotations"] = ",".join(param_annotation_list) 2967 2968 # debug 2969 log.debug(f"param_annotations={param['annotations']}") 2970 2971 if param.get("annotations"): 2972 2973 # Log 2974 # log.info("Annotations - Check annotation parameters") 2975 2976 if not "annotation" in param: 2977 param["annotation"] = {} 2978 2979 # List of annotations parameters 2980 annotations_list_input = {} 2981 if isinstance(param.get("annotations", None), str): 2982 annotation_file_list = [ 2983 value for value in param.get("annotations", "").split(",") 2984 ] 2985 for annotation_file in annotation_file_list: 2986 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2987 else: 2988 annotations_list_input = param.get("annotations", {}) 2989 2990 log.info(f"Quick 
Annotations:") 2991 for annotation_key in list(annotations_list_input.keys()): 2992 log.info(f" {annotation_key}") 2993 2994 # List of annotations and associated fields 2995 annotations_list = {} 2996 2997 for annotation_file in annotations_list_input: 2998 2999 # Explode annotations if ALL 3000 if ( 3001 annotation_file.upper() == "ALL" 3002 or annotation_file.upper().startswith("ALL:") 3003 ): 3004 3005 # check ALL parameters (formats, releases) 3006 annotation_file_split = annotation_file.split(":") 3007 database_formats = "parquet" 3008 database_releases = "current" 3009 for annotation_file_option in annotation_file_split[1:]: 3010 database_all_options_split = annotation_file_option.split("=") 3011 if database_all_options_split[0] == "format": 3012 database_formats = database_all_options_split[1].split("+") 3013 if database_all_options_split[0] == "release": 3014 database_releases = database_all_options_split[1].split("+") 3015 3016 # Scan for availabled databases 3017 databases_infos_dict = self.scan_databases( 3018 database_formats=database_formats, 3019 database_releases=database_releases, 3020 ) 3021 3022 # Add found databases in annotation parameters 3023 for database_infos in databases_infos_dict.keys(): 3024 annotations_list[database_infos] = {"INFO": None} 3025 3026 else: 3027 annotations_list[annotation_file] = annotations_list_input[ 3028 annotation_file 3029 ] 3030 3031 # Check each databases 3032 if len(annotations_list): 3033 3034 log.info( 3035 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 3036 ) 3037 3038 for annotation_file in annotations_list: 3039 3040 # Init 3041 annotations = annotations_list.get(annotation_file, None) 3042 3043 # Annotation snpEff 3044 if annotation_file.startswith("snpeff"): 3045 3046 log.debug(f"Quick Annotation snpEff") 3047 3048 if "snpeff" not in param["annotation"]: 3049 param["annotation"]["snpeff"] = {} 3050 3051 if "options" not in param["annotation"]["snpeff"]: 3052 param["annotation"]["snpeff"]["options"] = "" 3053 3054 # snpEff options in annotations 3055 param["annotation"]["snpeff"]["options"] = "".join( 3056 annotation_file.split(":")[1:] 3057 ) 3058 3059 # Annotation Annovar 3060 elif annotation_file.startswith("annovar"): 3061 3062 log.debug(f"Quick Annotation Annovar") 3063 3064 if "annovar" not in param["annotation"]: 3065 param["annotation"]["annovar"] = {} 3066 3067 if "annotations" not in param["annotation"]["annovar"]: 3068 param["annotation"]["annovar"]["annotations"] = {} 3069 3070 # Options 3071 annotation_file_split = annotation_file.split(":") 3072 for annotation_file_annotation in annotation_file_split[1:]: 3073 if annotation_file_annotation: 3074 param["annotation"]["annovar"]["annotations"][ 3075 annotation_file_annotation 3076 ] = annotations 3077 3078 # Annotation Exomiser 3079 elif annotation_file.startswith("exomiser"): 3080 3081 log.debug(f"Quick Annotation Exomiser") 3082 3083 param["annotation"]["exomiser"] = params_string_to_dict( 3084 annotation_file 3085 ) 3086 3087 # Annotation Splice 3088 elif annotation_file.startswith("splice"): 3089 3090 log.debug(f"Quick Annotation Splice") 3091 3092 param["annotation"]["splice"] = params_string_to_dict( 3093 annotation_file 3094 ) 3095 3096 # Annotation Parquet or BCFTOOLS 3097 else: 3098 3099 # Tools detection 3100 if annotation_file.startswith("bcftools:"): 3101 annotation_tool_initial = "bcftools" 3102 annotation_file = ":".join(annotation_file.split(":")[1:]) 3103 elif annotation_file.startswith("snpsift:"): 3104 
annotation_tool_initial = "snpsift" 3105 annotation_file = ":".join(annotation_file.split(":")[1:]) 3106 elif annotation_file.startswith("bigwig:"): 3107 annotation_tool_initial = "bigwig" 3108 annotation_file = ":".join(annotation_file.split(":")[1:]) 3109 else: 3110 annotation_tool_initial = None 3111 3112 # list of files 3113 annotation_file_list = annotation_file.replace("+", ":").split( 3114 ":" 3115 ) 3116 3117 for annotation_file in annotation_file_list: 3118 3119 if annotation_file: 3120 3121 # Annotation tool initial 3122 annotation_tool = annotation_tool_initial 3123 3124 # Find file 3125 annotation_file_found = None 3126 3127 if os.path.exists(annotation_file): 3128 annotation_file_found = annotation_file 3129 elif os.path.exists(full_path(annotation_file)): 3130 annotation_file_found = full_path(annotation_file) 3131 else: 3132 # Find within assembly folders 3133 for annotations_database in annotations_databases: 3134 found_files = find_all( 3135 annotation_file, 3136 os.path.join( 3137 annotations_database, assembly 3138 ), 3139 ) 3140 if len(found_files) > 0: 3141 annotation_file_found = found_files[0] 3142 break 3143 if not annotation_file_found and not assembly: 3144 # Find within folders 3145 for ( 3146 annotations_database 3147 ) in annotations_databases: 3148 found_files = find_all( 3149 annotation_file, annotations_database 3150 ) 3151 if len(found_files) > 0: 3152 annotation_file_found = found_files[0] 3153 break 3154 log.debug( 3155 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3156 ) 3157 3158 # Full path 3159 annotation_file_found = full_path(annotation_file_found) 3160 3161 if annotation_file_found: 3162 3163 database = Database(database=annotation_file_found) 3164 quick_annotation_format = database.get_format() 3165 quick_annotation_is_compressed = ( 3166 database.is_compressed() 3167 ) 3168 quick_annotation_is_indexed = os.path.exists( 3169 f"{annotation_file_found}.tbi" 3170 ) 3171 bcftools_preference = False 3172 3173 # Check Annotation Tool 3174 if not annotation_tool: 3175 if ( 3176 bcftools_preference 3177 and quick_annotation_format 3178 in ["vcf", "bed"] 3179 and quick_annotation_is_compressed 3180 and quick_annotation_is_indexed 3181 ): 3182 annotation_tool = "bcftools" 3183 elif quick_annotation_format in [ 3184 "vcf", 3185 "bed", 3186 "tsv", 3187 "tsv", 3188 "csv", 3189 "json", 3190 "tbl", 3191 "parquet", 3192 "duckdb", 3193 ]: 3194 annotation_tool = "parquet" 3195 elif quick_annotation_format in ["bw"]: 3196 annotation_tool = "bigwig" 3197 else: 3198 log.error( 3199 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3200 ) 3201 raise ValueError( 3202 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3203 ) 3204 3205 log.debug( 3206 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3207 ) 3208 3209 # Annotation Tool dispatch 3210 if annotation_tool: 3211 if annotation_tool not in param["annotation"]: 3212 param["annotation"][annotation_tool] = {} 3213 if ( 3214 "annotations" 3215 not in param["annotation"][annotation_tool] 3216 ): 3217 param["annotation"][annotation_tool][ 3218 "annotations" 3219 ] = {} 3220 param["annotation"][annotation_tool][ 3221 "annotations" 3222 ][annotation_file_found] = annotations 3223 3224 else: 3225 log.warning( 3226 f"Quick Annotation File {annotation_file} does NOT exist" 3227 ) 3228 3229 self.set_param(param) 3230 3231 if param.get("annotation", None): 3232 
log.info("Annotations") 3233 if param.get("annotation", {}).get("parquet", None): 3234 log.info("Annotations 'parquet'...") 3235 self.annotation_parquet() 3236 if param.get("annotation", {}).get("bcftools", None): 3237 log.info("Annotations 'bcftools'...") 3238 self.annotation_bcftools() 3239 if param.get("annotation", {}).get("snpsift", None): 3240 log.info("Annotations 'snpsift'...") 3241 self.annotation_snpsift() 3242 if param.get("annotation", {}).get("bigwig", None): 3243 log.info("Annotations 'bigwig'...") 3244 self.annotation_bigwig() 3245 if param.get("annotation", {}).get("annovar", None): 3246 log.info("Annotations 'annovar'...") 3247 self.annotation_annovar() 3248 if param.get("annotation", {}).get("snpeff", None): 3249 log.info("Annotations 'snpeff'...") 3250 self.annotation_snpeff() 3251 if param.get("annotation", {}).get("exomiser", None) is not None: 3252 log.info("Annotations 'exomiser'...") 3253 self.annotation_exomiser() 3254 if param.get("annotation", {}).get("splice", None) is not None: 3255 log.info("Annotations 'splice' ...") 3256 self.annotation_splice() 3257 3258 # Explode INFOS fields into table fields 3259 if self.get_explode_infos(): 3260 self.explode_infos( 3261 prefix=self.get_explode_infos_prefix(), 3262 fields=self.get_explode_infos_fields(), 3263 force=True, 3264 ) 3265 3266 def annotation_bigwig(self, threads: int = None) -> None: 3267 """ 3268 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3269 3270 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3271 number of threads to be used for parallel processing during the annotation process. If the 3272 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3273 threads to use based on the system configuration 3274 :type threads: int 3275 :return: True 3276 """ 3277 3278 # DEBUG 3279 log.debug("Start annotation with bigwig databases") 3280 3281 # # Threads 3282 # if not threads: 3283 # threads = self.get_threads() 3284 # log.debug("Threads: " + str(threads)) 3285 3286 # Config 3287 config = self.get_config() 3288 log.debug("Config: " + str(config)) 3289 3290 # Config - BCFTools databases folders 3291 databases_folders = set( 3292 self.get_config() 3293 .get("folders", {}) 3294 .get("databases", {}) 3295 .get("annotations", ["."]) 3296 + self.get_config() 3297 .get("folders", {}) 3298 .get("databases", {}) 3299 .get("bigwig", ["."]) 3300 ) 3301 log.debug("Databases annotations: " + str(databases_folders)) 3302 3303 # Param 3304 annotations = ( 3305 self.get_param() 3306 .get("annotation", {}) 3307 .get("bigwig", {}) 3308 .get("annotations", None) 3309 ) 3310 log.debug("Annotations: " + str(annotations)) 3311 3312 # Assembly 3313 assembly = self.get_param().get( 3314 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3315 ) 3316 3317 # Data 3318 table_variants = self.get_table_variants() 3319 3320 # Check if not empty 3321 log.debug("Check if not empty") 3322 sql_query_chromosomes = ( 3323 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3324 ) 3325 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3326 if not sql_query_chromosomes_df["count"][0]: 3327 log.info(f"VCF empty") 3328 return 3329 3330 # VCF header 3331 vcf_reader = self.get_header() 3332 log.debug("Initial header: " + str(vcf_reader.infos)) 3333 3334 # Existing annotations 3335 for vcf_annotation in self.get_header().infos: 3336 3337 vcf_annotation_line = 
self.get_header().infos.get(vcf_annotation) 3338 log.debug( 3339 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3340 ) 3341 3342 if annotations: 3343 3344 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3345 3346 # Export VCF file 3347 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3348 3349 # annotation_bigwig_config 3350 annotation_bigwig_config_list = [] 3351 3352 for annotation in annotations: 3353 annotation_fields = annotations[annotation] 3354 3355 # Annotation Name 3356 annotation_name = os.path.basename(annotation) 3357 3358 if not annotation_fields: 3359 annotation_fields = {"INFO": None} 3360 3361 log.debug(f"Annotation '{annotation_name}'") 3362 log.debug( 3363 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3364 ) 3365 3366 # Create Database 3367 database = Database( 3368 database=annotation, 3369 databases_folders=databases_folders, 3370 assembly=assembly, 3371 ) 3372 3373 # Find files 3374 db_file = database.get_database() 3375 db_file = full_path(db_file) 3376 db_hdr_file = database.get_header_file() 3377 db_hdr_file = full_path(db_hdr_file) 3378 db_file_type = database.get_format() 3379 3380 # If db_file is http ? 3381 if database.get_database().startswith("http"): 3382 3383 # Datbase is HTTP URL 3384 db_file_is_http = True 3385 3386 # DB file keep as URL 3387 db_file = database.get_database() 3388 log.warning( 3389 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3390 ) 3391 3392 # Retrieve automatic annotation field name 3393 annotation_field = clean_annotation_field( 3394 os.path.basename(db_file).replace(".bw", "") 3395 ) 3396 log.debug( 3397 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3398 ) 3399 3400 # Create automatic header file 3401 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3402 with open(db_hdr_file, "w") as f: 3403 f.write("##fileformat=VCFv4.2\n") 3404 f.write( 3405 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3406 ) 3407 f.write(f"#CHROM START END {annotation_field}\n") 3408 3409 else: 3410 3411 # Datbase is NOT HTTP URL 3412 db_file_is_http = False 3413 3414 # Check index - try to create if not exists 3415 if ( 3416 db_file is None 3417 or db_hdr_file is None 3418 or (not os.path.exists(db_file) and not db_file_is_http) 3419 or not os.path.exists(db_hdr_file) 3420 or not db_file_type in ["bw"] 3421 ): 3422 # if False: 3423 log.error("Annotation failed: database not valid") 3424 log.error(f"Annotation annotation file: {db_file}") 3425 log.error(f"Annotation annotation file type: {db_file_type}") 3426 log.error(f"Annotation annotation header: {db_hdr_file}") 3427 raise ValueError( 3428 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3429 ) 3430 else: 3431 3432 # Log 3433 log.debug( 3434 f"Annotation '{annotation}' - file: " 3435 + str(db_file) 3436 + " and " 3437 + str(db_hdr_file) 3438 ) 3439 3440 # Load header as VCF object 3441 db_hdr_vcf = Variants(input=db_hdr_file) 3442 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3443 log.debug( 3444 "Annotation database header: " 3445 + str(db_hdr_vcf_header_infos) 3446 ) 3447 3448 # For all fields in database 3449 annotation_fields_full = False 3450 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3451 annotation_fields = { 3452 key: key for key in db_hdr_vcf_header_infos 3453 } 3454 log.debug( 3455 
"Annotation database header - All annotations added: " 3456 + str(annotation_fields) 3457 ) 3458 annotation_fields_full = True 3459 3460 # Init 3461 cyvcf2_header_rename_dict = {} 3462 cyvcf2_header_list = [] 3463 cyvcf2_header_indexes = {} 3464 3465 # process annotation fields 3466 for annotation_field in annotation_fields: 3467 3468 # New annotation name 3469 annotation_field_new = annotation_fields[annotation_field] 3470 3471 # Check annotation field and index in header 3472 if ( 3473 annotation_field 3474 in db_hdr_vcf.get_header_columns_as_list() 3475 ): 3476 annotation_field_index = ( 3477 db_hdr_vcf.get_header_columns_as_list().index( 3478 annotation_field 3479 ) 3480 - 3 3481 ) 3482 cyvcf2_header_indexes[annotation_field_new] = ( 3483 annotation_field_index 3484 ) 3485 else: 3486 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3487 log.error(msg_err) 3488 raise ValueError(msg_err) 3489 3490 # Append annotation field in cyvcf2 header list 3491 cyvcf2_header_rename_dict[annotation_field_new] = ( 3492 db_hdr_vcf_header_infos[annotation_field].id 3493 ) 3494 cyvcf2_header_list.append( 3495 { 3496 "ID": annotation_field_new, 3497 "Number": db_hdr_vcf_header_infos[ 3498 annotation_field 3499 ].num, 3500 "Type": db_hdr_vcf_header_infos[ 3501 annotation_field 3502 ].type, 3503 "Description": db_hdr_vcf_header_infos[ 3504 annotation_field 3505 ].desc, 3506 } 3507 ) 3508 3509 # Add header on VCF 3510 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3511 annotation_field_new, 3512 db_hdr_vcf_header_infos[annotation_field].num, 3513 db_hdr_vcf_header_infos[annotation_field].type, 3514 db_hdr_vcf_header_infos[annotation_field].desc, 3515 "HOWARD BigWig annotation", 3516 "unknown", 3517 self.code_type_map[ 3518 db_hdr_vcf_header_infos[annotation_field].type 3519 ], 3520 ) 3521 3522 # Load bigwig database 3523 bw_db = pyBigWig.open(db_file) 3524 if bw_db.isBigWig(): 3525 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3526 else: 3527 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3528 log.error(msg_err) 3529 raise ValueError(msg_err) 3530 3531 annotation_bigwig_config_list.append( 3532 { 3533 "db_file": db_file, 3534 "bw_db": bw_db, 3535 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3536 "cyvcf2_header_list": cyvcf2_header_list, 3537 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3538 } 3539 ) 3540 3541 # Annotate 3542 if annotation_bigwig_config_list: 3543 3544 # Annotation config 3545 log.debug( 3546 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3547 ) 3548 3549 # Export VCF file 3550 self.export_variant_vcf( 3551 vcf_file=tmp_vcf_name, 3552 remove_info=True, 3553 add_samples=False, 3554 index=True, 3555 ) 3556 3557 # Load input tmp file 3558 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3559 3560 # Add header in input file 3561 for annotation_bigwig_config in annotation_bigwig_config_list: 3562 for cyvcf2_header_field in annotation_bigwig_config.get( 3563 "cyvcf2_header_list", [] 3564 ): 3565 log.info( 3566 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3567 ) 3568 input_vcf.add_info_to_header(cyvcf2_header_field) 3569 3570 # Create output VCF file 3571 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3572 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3573 3574 # Fetch variants 
3575 log.info(f"Annotations 'bigwig' start...") 3576 for variant in input_vcf: 3577 3578 for annotation_bigwig_config in annotation_bigwig_config_list: 3579 3580 # DB and indexes 3581 bw_db = annotation_bigwig_config.get("bw_db", None) 3582 cyvcf2_header_indexes = annotation_bigwig_config.get( 3583 "cyvcf2_header_indexes", None 3584 ) 3585 3586 # Retrieve value from chrom pos 3587 res = bw_db.values( 3588 variant.CHROM, variant.POS - 1, variant.POS 3589 ) 3590 3591 # For each annotation field (and index) 3592 for cyvcf2_header_index in cyvcf2_header_indexes: 3593 3594 # If value is not NaN 3595 if not np.isnan( 3596 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3597 ): 3598 variant.INFO[cyvcf2_header_index] = res[ 3599 cyvcf2_header_indexes[cyvcf2_header_index] 3600 ] 3601 3602 # Add record in output file 3603 output_vcf.write_record(variant) 3604 3605 # Log 3606 log.debug(f"Annotation done.") 3607 3608 # Close and write file 3609 log.info(f"Annotations 'bigwig' write...") 3610 output_vcf.close() 3611 log.debug(f"Write done.") 3612 3613 # Update variants 3614 log.info(f"Annotations 'bigwig' update...") 3615 self.update_from_vcf(output_vcf_file) 3616 log.debug(f"Update done.") 3617 3618 return True 3619 3620 def annotation_snpsift(self, threads: int = None) -> None: 3621 """ 3622 This function annotates with SnpSift (bcftools is used to rename INFO fields) 3623 3624 :param threads: Number of threads to use 3625 :return: None. 3626 """ 3627 3628 # DEBUG 3629 log.debug("Start annotation with SnpSift databases") 3630 3631 # Threads 3632 if not threads: 3633 threads = self.get_threads() 3634 log.debug("Threads: " + str(threads)) 3635 3636 # Config 3637 config = self.get_config() 3638 log.debug("Config: " + str(config)) 3639 3640 # Config - snpSift 3641 snpsift_bin_command = get_bin_command( 3642 bin="SnpSift.jar", 3643 tool="snpsift", 3644 bin_type="jar", 3645 config=config, 3646 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3647 ) 3648 if not snpsift_bin_command: 3649 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3650 log.error(msg_err) 3651 raise ValueError(msg_err) 3652 3653 # Config - bcftools 3654 bcftools_bin_command = get_bin_command( 3655 bin="bcftools", 3656 tool="bcftools", 3657 bin_type="bin", 3658 config=config, 3659 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3660 ) 3661 if not bcftools_bin_command: 3662 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3663 log.error(msg_err) 3664 raise ValueError(msg_err) 3665 3666 # Config - SnpSift/BCFTools databases folders 3667 databases_folders = set( 3668 self.get_config() 3669 .get("folders", {}) 3670 .get("databases", {}) 3671 .get("annotations", ["."]) 3672 + self.get_config() 3673 .get("folders", {}) 3674 .get("databases", {}) 3675 .get("bcftools", ["."]) 3676 ) 3677 log.debug("Databases annotations: " + str(databases_folders)) 3678 3679 # Param 3680 annotations = ( 3681 self.get_param() 3682 .get("annotation", {}) 3683 .get("snpsift", {}) 3684 .get("annotations", None) 3685 ) 3686 log.debug("Annotations: " + str(annotations)) 3687 3688 # Assembly 3689 assembly = self.get_param().get( 3690 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3691 ) 3692 3693 # Data 3694 table_variants = self.get_table_variants() 3695 3696 # Check if not empty 3697 log.debug("Check if not empty") 3698 sql_query_chromosomes = ( 3699 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3700 ) 3701 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3702 if
not sql_query_chromosomes_df["count"][0]: 3703 log.info(f"VCF empty") 3704 return 3705 3706 # VCF header 3707 vcf_reader = self.get_header() 3708 log.debug("Initial header: " + str(vcf_reader.infos)) 3709 3710 # Existing annotations 3711 for vcf_annotation in self.get_header().infos: 3712 3713 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3714 log.debug( 3715 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3716 ) 3717 3718 if annotations: 3719 3720 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3721 3722 # Export VCF file 3723 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3724 3725 # Init 3726 commands = {} 3727 3728 for annotation in annotations: 3729 annotation_fields = annotations[annotation] 3730 3731 # Annotation Name 3732 annotation_name = os.path.basename(annotation) 3733 3734 if not annotation_fields: 3735 annotation_fields = {"INFO": None} 3736 3737 log.debug(f"Annotation '{annotation_name}'") 3738 log.debug( 3739 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3740 ) 3741 3742 # Create Database 3743 database = Database( 3744 database=annotation, 3745 databases_folders=databases_folders, 3746 assembly=assembly, 3747 ) 3748 3749 # Find files 3750 db_file = database.get_database() 3751 db_file = full_path(db_file) 3752 db_hdr_file = database.get_header_file() 3753 db_hdr_file = full_path(db_hdr_file) 3754 db_file_type = database.get_format() 3755 db_tbi_file = f"{db_file}.tbi" 3756 db_file_compressed = database.is_compressed() 3757 3758 # Check if compressed 3759 if not db_file_compressed: 3760 log.error( 3761 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3762 ) 3763 raise ValueError( 3764 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3765 ) 3766 3767 # Check if indexed 3768 if not os.path.exists(db_tbi_file): 3769 log.error( 3770 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3771 ) 3772 raise ValueError( 3773 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3774 ) 3775 3776 # Check index - try to create if not exists 3777 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3778 log.error("Annotation failed: database not valid") 3779 log.error(f"Annotation annotation file: {db_file}") 3780 log.error(f"Annotation annotation header: {db_hdr_file}") 3781 log.error(f"Annotation annotation index: {db_tbi_file}") 3782 raise ValueError( 3783 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3784 ) 3785 else: 3786 3787 log.debug( 3788 f"Annotation '{annotation}' - file: " 3789 + str(db_file) 3790 + " and " 3791 + str(db_hdr_file) 3792 ) 3793 3794 # Load header as VCF object 3795 db_hdr_vcf = Variants(input=db_hdr_file) 3796 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3797 log.debug( 3798 "Annotation database header: " 3799 + str(db_hdr_vcf_header_infos) 3800 ) 3801 3802 # For all fields in database 3803 annotation_fields_full = False 3804 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3805 annotation_fields = { 3806 key: key for key in db_hdr_vcf_header_infos 3807 } 3808 log.debug( 3809 "Annotation database header - All annotations added: " 3810 + str(annotation_fields) 3811 ) 3812 annotation_fields_full = True 3813 3814 # # Create file for field rename 3815 # log.debug("Create file for field rename") 3816 # tmp_rename = NamedTemporaryFile( 3817 # prefix=self.get_prefix(), 3818 # 
dir=self.get_tmp_dir(), 3819 # suffix=".rename", 3820 # delete=False, 3821 # ) 3822 # tmp_rename_name = tmp_rename.name 3823 # tmp_files.append(tmp_rename_name) 3824 3825 # Number of fields 3826 nb_annotation_field = 0 3827 annotation_list = [] 3828 annotation_infos_rename_list = [] 3829 3830 for annotation_field in annotation_fields: 3831 3832 # Field new name if configured (renaming handling is incomplete; TODO) 3833 annotation_fields_new_name = annotation_fields.get( 3834 annotation_field, annotation_field 3835 ) 3836 if not annotation_fields_new_name: 3837 annotation_fields_new_name = annotation_field 3838 3839 # Check if field is in DB and if field is not already in input data 3840 if ( 3841 annotation_field in db_hdr_vcf.get_header().infos 3842 and annotation_fields_new_name 3843 not in self.get_header().infos 3844 ): 3845 3846 log.info( 3847 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3848 ) 3849 3850 # BCFTools annotate param to rename fields 3851 if annotation_field != annotation_fields_new_name: 3852 annotation_infos_rename_list.append( 3853 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3854 ) 3855 3856 # Add INFO field to header 3857 db_hdr_vcf_header_infos_number = ( 3858 db_hdr_vcf_header_infos[annotation_field].num or "." 3859 ) 3860 db_hdr_vcf_header_infos_type = ( 3861 db_hdr_vcf_header_infos[annotation_field].type 3862 or "String" 3863 ) 3864 db_hdr_vcf_header_infos_description = ( 3865 db_hdr_vcf_header_infos[annotation_field].desc 3866 or f"{annotation_field} description" 3867 ) 3868 db_hdr_vcf_header_infos_source = ( 3869 db_hdr_vcf_header_infos[annotation_field].source 3870 or "unknown" 3871 ) 3872 db_hdr_vcf_header_infos_version = ( 3873 db_hdr_vcf_header_infos[annotation_field].version 3874 or "unknown" 3875 ) 3876 3877 vcf_reader.infos[annotation_fields_new_name] = ( 3878 vcf.parser._Info( 3879 annotation_fields_new_name, 3880 db_hdr_vcf_header_infos_number, 3881 db_hdr_vcf_header_infos_type, 3882 db_hdr_vcf_header_infos_description, 3883 db_hdr_vcf_header_infos_source, 3884 db_hdr_vcf_header_infos_version, 3885 self.code_type_map[ 3886 db_hdr_vcf_header_infos_type 3887 ], 3888 ) 3889 ) 3890 3891 annotation_list.append(annotation_field) 3892 3893 nb_annotation_field += 1 3894 3895 else: 3896 3897 if ( 3898 annotation_field 3899 not in db_hdr_vcf.get_header().infos 3900 ): 3901 log.warning( 3902 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3903 ) 3904 if ( 3905 annotation_fields_new_name 3906 in self.get_header().infos 3907 ): 3908 log.warning( 3909 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3910 ) 3911 3912 log.info( 3913 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3914 ) 3915 3916 annotation_infos = ",".join(annotation_list) 3917 3918 if annotation_infos != "": 3919 3920 # Annotated VCF (and error file) 3921 tmp_annotation_vcf_name = os.path.join( 3922 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3923 ) 3924 tmp_annotation_vcf_name_err = ( 3925 tmp_annotation_vcf_name + ".err" 3926 ) 3927 3928 # Add fields to annotate 3929 if not annotation_fields_full: 3930 annotation_infos_option = f"-info {annotation_infos}" 3931 else: 3932 annotation_infos_option = "" 3933 3934 # Info fields rename 3935 if annotation_infos_rename_list: 3936 annotation_infos_rename = " -c " + ",".join( 3937 annotation_infos_rename_list 3938 ) 3939 else: 3940 annotation_infos_rename = ""
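# --- Note on the rename list built above (illustrative): `bcftools annotate -c`
# accepts entries of the form NEW:=INFO/OLD to carry an INFO tag over under a
# new name (hedged: supported in reasonably recent bcftools releases). For
# example, a hypothetical mapping {"AF": "gnomAD_AF"} would yield:
#   bcftools annotate -c 'gnomAD_AF:=INFO/AF' input.vcf.gz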
3941 3942 # Annotate command 3943 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3944 3945 # Add command 3946 commands[command_annotate] = tmp_annotation_vcf_name 3947 3948 if commands: 3949 3950 # Export VCF file 3951 self.export_variant_vcf( 3952 vcf_file=tmp_vcf_name, 3953 remove_info=True, 3954 add_samples=False, 3955 index=True, 3956 ) 3957 3958 # Num command 3959 3960 nb_command = 0 3961 3962 # Annotate 3963 for command_annotate in commands: 3964 nb_command += 1 3965 log.info( 3966 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3967 ) 3968 log.debug(f"command_annotate={command_annotate}") 3969 run_parallel_commands([command_annotate], threads) 3970 3971 3972 3973 3974 # Update variants 3975 log.info( 3976 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3977 ) 3978 self.update_from_vcf(commands[command_annotate]) 3979 3980 def annotation_bcftools(self, threads: int = None) -> None: 3981 """ 3982 This function annotates with bcftools 3983 3984 :param threads: Number of threads to use 3985 :return: None. 3986 """ 3987 3988 # DEBUG 3989 log.debug("Start annotation with bcftools databases") 3990 3991 # Threads 3992 if not threads: 3993 threads = self.get_threads() 3994 log.debug("Threads: " + str(threads)) 3995 3996 # Config 3997 config = self.get_config() 3998 log.debug("Config: " + str(config)) 3999 4000 # DEBUG 4001 delete_tmp = True 4002 if self.get_config().get("verbosity", "warning") in ["debug"]: 4003 delete_tmp = False 4004 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4005 4006 # Config - BCFTools bin command 4007 bcftools_bin_command = get_bin_command( 4008 bin="bcftools", 4009 tool="bcftools", 4010 bin_type="bin", 4011 config=config, 4012 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4013 ) 4014 if not bcftools_bin_command: 4015 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4016 log.error(msg_err) 4017 raise ValueError(msg_err) 4018 4019 # Config - BCFTools databases folders 4020 databases_folders = set( 4021 self.get_config() 4022 .get("folders", {}) 4023 .get("databases", {}) 4024 .get("annotations", ["."]) 4025 + self.get_config() 4026 .get("folders", {}) 4027 .get("databases", {}) 4028 .get("bcftools", ["."]) 4029 ) 4030 log.debug("Databases annotations: " + str(databases_folders)) 4031 4032 # Param 4033 annotations = ( 4034 self.get_param() 4035 .get("annotation", {}) 4036 .get("bcftools", {}) 4037 .get("annotations", None) 4038 ) 4039 log.debug("Annotations: " + str(annotations)) 4040 4041 # Assembly 4042 assembly = self.get_param().get( 4043 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4044 ) 4045 4046 # Data 4047 table_variants = self.get_table_variants() 4048 4049 # Check if not empty 4050 log.debug("Check if not empty") 4051 sql_query_chromosomes = ( 4052 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4053 ) 4054 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4055 if not sql_query_chromosomes_df["count"][0]: 4056 log.info(f"VCF empty") 4057 return 4058 4059 # Export in VCF 4060 log.debug("Create initial file to annotate")
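# --- Flow note (summary, inferred from the code below): the variants table is
# exported once to a bgzipped VCF, one `bcftools annotate` command is built per
# (database, chromosome) region set, the commands run in parallel, the partial
# outputs are merged back with `bcftools merge --force-samples`, and the merged
# VCF is folded into the variants table via update_from_vcf().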
4061 tmp_vcf = NamedTemporaryFile( 4062 prefix=self.get_prefix(), 4063 dir=self.get_tmp_dir(), 4064 suffix=".vcf.gz", 4065 delete=False, 4066 ) 4067 tmp_vcf_name = tmp_vcf.name 4068 4069 # VCF header 4070 vcf_reader = self.get_header() 4071 log.debug("Initial header: " + str(vcf_reader.infos)) 4072 4073 # Existing annotations 4074 for vcf_annotation in self.get_header().infos: 4075 4076 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4077 log.debug( 4078 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4079 ) 4080 4081 if annotations: 4082 4083 tmp_ann_vcf_list = [] 4084 commands = [] 4085 tmp_files = [] 4086 err_files = [] 4087 4088 for annotation in annotations: 4089 annotation_fields = annotations[annotation] 4090 4091 # Annotation Name 4092 annotation_name = os.path.basename(annotation) 4093 4094 if not annotation_fields: 4095 annotation_fields = {"INFO": None} 4096 4097 log.debug(f"Annotation '{annotation_name}'") 4098 log.debug( 4099 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4100 ) 4101 4102 # Create Database 4103 database = Database( 4104 database=annotation, 4105 databases_folders=databases_folders, 4106 assembly=assembly, 4107 ) 4108 4109 # Find files 4110 db_file = database.get_database() 4111 db_file = full_path(db_file) 4112 db_hdr_file = database.get_header_file() 4113 db_hdr_file = full_path(db_hdr_file) 4114 db_file_type = database.get_format() 4115 db_tbi_file = f"{db_file}.tbi" 4116 db_file_compressed = database.is_compressed() 4117 4118 # Check if compressed 4119 if not db_file_compressed: 4120 log.error( 4121 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4122 ) 4123 raise ValueError( 4124 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4125 ) 4126 4127 # Check if indexed 4128 if not os.path.exists(db_tbi_file): 4129 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4130 raise ValueError( 4131 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4132 ) 4133 4134 # Check database validity (files exist) 4135 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4136 log.error("Annotation failed: database not valid") 4137 log.error(f"Annotation annotation file: {db_file}") 4138 log.error(f"Annotation annotation header: {db_hdr_file}") 4139 log.error(f"Annotation annotation index: {db_tbi_file}") 4140 raise ValueError( 4141 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4142 ) 4143 else: 4144 4145 log.debug( 4146 f"Annotation '{annotation}' - file: " 4147 + str(db_file) 4148 + " and " 4149 + str(db_hdr_file) 4150 ) 4151 4152 # Load header as VCF object 4153 db_hdr_vcf = Variants(input=db_hdr_file) 4154 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4155 log.debug( 4156 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4157 ) 4158 4159 # For all fields in database 4160 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4161 annotation_fields = { 4162 key: key for key in db_hdr_vcf_header_infos 4163 } 4164 log.debug( 4165 "Annotation database header - All annotations added: " 4166 + str(annotation_fields) 4167 ) 4168 4169 # Number of fields 4170 nb_annotation_field = 0 4171 annotation_list = [] 4172 4173 for annotation_field in annotation_fields: 4174 4175 # Field new name if configured (renaming handling is incomplete; TODO)
4176 annotation_fields_new_name = annotation_fields.get( 4177 annotation_field, annotation_field 4178 ) 4179 if not annotation_fields_new_name: 4180 annotation_fields_new_name = annotation_field 4181 4182 # Check if field is in DB and if field is not already in input data 4183 if ( 4184 annotation_field in db_hdr_vcf.get_header().infos 4185 and annotation_fields_new_name 4186 not in self.get_header().infos 4187 ): 4188 4189 log.info( 4190 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4191 ) 4192 4193 # Add INFO field to header 4194 db_hdr_vcf_header_infos_number = ( 4195 db_hdr_vcf_header_infos[annotation_field].num or "." 4196 ) 4197 db_hdr_vcf_header_infos_type = ( 4198 db_hdr_vcf_header_infos[annotation_field].type 4199 or "String" 4200 ) 4201 db_hdr_vcf_header_infos_description = ( 4202 db_hdr_vcf_header_infos[annotation_field].desc 4203 or f"{annotation_field} description" 4204 ) 4205 db_hdr_vcf_header_infos_source = ( 4206 db_hdr_vcf_header_infos[annotation_field].source 4207 or "unknown" 4208 ) 4209 db_hdr_vcf_header_infos_version = ( 4210 db_hdr_vcf_header_infos[annotation_field].version 4211 or "unknown" 4212 ) 4213 4214 vcf_reader.infos[annotation_fields_new_name] = ( 4215 vcf.parser._Info( 4216 annotation_fields_new_name, 4217 db_hdr_vcf_header_infos_number, 4218 db_hdr_vcf_header_infos_type, 4219 db_hdr_vcf_header_infos_description, 4220 db_hdr_vcf_header_infos_source, 4221 db_hdr_vcf_header_infos_version, 4222 self.code_type_map[db_hdr_vcf_header_infos_type], 4223 ) 4224 ) 4225 4226 # annotation_list.append(annotation_field) 4227 if annotation_field != annotation_fields_new_name: 4228 annotation_list.append( 4229 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4230 ) 4231 else: 4232 annotation_list.append(annotation_field) 4233 4234 nb_annotation_field += 1 4235 4236 else: 4237 4238 if annotation_field not in db_hdr_vcf.get_header().infos: 4239 log.warning( 4240 f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4241 ) 4242 if annotation_fields_new_name in self.get_header().infos: 4243 log.warning( 4244 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4245 ) 4246 4247 log.info( 4248 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4249 ) 4250 4251 annotation_infos = ",".join(annotation_list) 4252 4253 if annotation_infos != "": 4254 4255 # Protect header for bcftools (remove "#CHROM" and variants line) 4256 log.debug("Protect Header file - remove #CHROM line if exists") 4257 tmp_header_vcf = NamedTemporaryFile( 4258 prefix=self.get_prefix(), 4259 dir=self.get_tmp_dir(), 4260 suffix=".hdr", 4261 delete=False, 4262 ) 4263 tmp_header_vcf_name = tmp_header_vcf.name 4264 tmp_files.append(tmp_header_vcf_name) 4265 # Command 4266 if db_hdr_file.endswith(".gz"): 4267 command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4268 else: 4269 command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4270 # Run 4271 run_parallel_commands([command_extract_header], 1) 4272 4273 # Find chromosomes 4274 log.debug("Find chromosomes") 4275 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4276 sql_query_chromosomes_df = self.get_query_to_df( 4277 sql_query_chromosomes 4278 ) 4279 chromosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4280
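# --- Worked example of the ±1Mb windowing below (merge_regions is assumed to
# collapse overlapping intervals; behavior inferred from its use here): with
# window = 1000000, variants at chr1:1500000 and chr1:2000000 yield the
# intervals (chr1, 499999, 2500000) and (chr1, 999999, 3000000), which merge
# into the single BED region (chr1, 499999, 3000000).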
log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4282 4283 # BED columns in the annotation file 4284 if db_file_type in ["bed"]: 4285 annotation_infos = "CHROM,POS,POS," + annotation_infos 4286 4287 for chrom in chomosomes_list: 4288 4289 # Create BED on initial VCF 4290 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4291 tmp_bed = NamedTemporaryFile( 4292 prefix=self.get_prefix(), 4293 dir=self.get_tmp_dir(), 4294 suffix=".bed", 4295 delete=False, 4296 ) 4297 tmp_bed_name = tmp_bed.name 4298 tmp_files.append(tmp_bed_name) 4299 4300 # Detecte regions 4301 log.debug( 4302 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4303 ) 4304 window = 1000000 4305 sql_query_intervals_for_bed = f""" 4306 SELECT \"#CHROM\", 4307 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4308 \"POS\"+{window} 4309 FROM {table_variants} as table_variants 4310 WHERE table_variants.\"#CHROM\" = '{chrom}' 4311 """ 4312 regions = self.conn.execute( 4313 sql_query_intervals_for_bed 4314 ).fetchall() 4315 merged_regions = merge_regions(regions) 4316 log.debug( 4317 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 4318 ) 4319 4320 header = ["#CHROM", "START", "END"] 4321 with open(tmp_bed_name, "w") as f: 4322 # Write the header with tab delimiter 4323 f.write("\t".join(header) + "\n") 4324 for d in merged_regions: 4325 # Write each data row with tab delimiter 4326 f.write("\t".join(map(str, d)) + "\n") 4327 4328 # Tmp files 4329 tmp_annotation_vcf = NamedTemporaryFile( 4330 prefix=self.get_prefix(), 4331 dir=self.get_tmp_dir(), 4332 suffix=".vcf.gz", 4333 delete=False, 4334 ) 4335 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4336 tmp_files.append(tmp_annotation_vcf_name) 4337 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4338 tmp_annotation_vcf_name_err = ( 4339 tmp_annotation_vcf_name + ".err" 4340 ) 4341 err_files.append(tmp_annotation_vcf_name_err) 4342 4343 # Annotate Command 4344 log.debug( 4345 f"Annotation '{annotation}' - add bcftools command" 4346 ) 4347 4348 # Command 4349 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 4350 4351 # Add command 4352 commands.append(command_annotate) 4353 4354 # if some commands 4355 if commands: 4356 4357 # Export VCF file 4358 self.export_variant_vcf( 4359 vcf_file=tmp_vcf_name, 4360 remove_info=True, 4361 add_samples=False, 4362 index=True, 4363 ) 4364 4365 # Threads 4366 # calculate threads for annotated commands 4367 if commands: 4368 threads_bcftools_annotate = round(threads / len(commands)) 4369 else: 4370 threads_bcftools_annotate = 1 4371 4372 if not threads_bcftools_annotate: 4373 threads_bcftools_annotate = 1 4374 4375 # Add threads option to bcftools commands 4376 if threads_bcftools_annotate > 1: 4377 commands_threaded = [] 4378 for command in commands: 4379 commands_threaded.append( 4380 command.replace( 4381 f"{bcftools_bin_command} annotate ", 4382 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4383 ) 4384 ) 4385 commands = commands_threaded 4386 4387 # Command annotation multithreading 4388 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4389 log.info( 4390 f"Annotation - Annotation multithreaded in " 4391 + str(len(commands)) 4392 + " 
commands" 4393 ) 4394 4395 run_parallel_commands(commands, threads) 4396 4397 # Merge 4398 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4399 4400 if tmp_ann_vcf_list_cmd: 4401 4402 # Tmp file 4403 tmp_annotate_vcf = NamedTemporaryFile( 4404 prefix=self.get_prefix(), 4405 dir=self.get_tmp_dir(), 4406 suffix=".vcf.gz", 4407 delete=True, 4408 ) 4409 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4410 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4411 err_files.append(tmp_annotate_vcf_name_err) 4412 4413 # Tmp file remove command 4414 tmp_files_remove_command = "" 4415 if tmp_files: 4416 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4417 4418 # Command merge 4419 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4420 log.info( 4421 f"Annotation - Annotation merging " 4422 + str(len(commands)) 4423 + " annotated files" 4424 ) 4425 log.debug(f"Annotation - merge command: {merge_command}") 4426 run_parallel_commands([merge_command], 1) 4427 4428 # Error messages 4429 log.info(f"Error/Warning messages:") 4430 error_message_command_all = [] 4431 error_message_command_warning = [] 4432 error_message_command_err = [] 4433 for err_file in err_files: 4434 with open(err_file, "r") as f: 4435 for line in f: 4436 message = line.strip() 4437 error_message_command_all.append(message) 4438 if line.startswith("[W::"): 4439 error_message_command_warning.append(message) 4440 if line.startswith("[E::"): 4441 error_message_command_err.append( 4442 f"{err_file}: " + message 4443 ) 4444 # log info 4445 for message in list( 4446 set(error_message_command_err + error_message_command_warning) 4447 ): 4448 log.info(f" {message}") 4449 # debug info 4450 for message in list(set(error_message_command_all)): 4451 log.debug(f" {message}") 4452 # failed 4453 if len(error_message_command_err): 4454 log.error("Annotation failed: Error in commands") 4455 raise ValueError("Annotation failed: Error in commands") 4456 4457 # Update variants 4458 log.info(f"Annotation - Updating...") 4459 self.update_from_vcf(tmp_annotate_vcf_name) 4460 4461 def annotation_exomiser(self, threads: int = None) -> None: 4462 """ 4463 This function annotate with Exomiser 4464 4465 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4466 - "analysis" (dict/file): 4467 Full analysis dictionnary parameters (see Exomiser docs). 4468 Either a dict, or a file in JSON or YAML format. 4469 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4470 Default : None 4471 - "preset" (string): 4472 Analysis preset (available in config folder). 4473 Used if no full "analysis" is provided. 4474 Default: "exome" 4475 - "phenopacket" (dict/file): 4476 Samples and phenotipic features parameters (see Exomiser docs). 4477 Either a dict, or a file in JSON or YAML format. 4478 Default: None 4479 - "subject" (dict): 4480 Sample parameters (see Exomiser docs). 4481 Example: 4482 "subject": 4483 { 4484 "id": "ISDBM322017", 4485 "sex": "FEMALE" 4486 } 4487 Default: None 4488 - "sample" (string): 4489 Sample name to construct "subject" section: 4490 "subject": 4491 { 4492 "id": "<sample>", 4493 "sex": "UNKNOWN_SEX" 4494 } 4495 Default: None 4496 - "phenotypicFeatures" (dict) 4497 Phenotypic features to construct "subject" section. 
4498 Example: 4499 "phenotypicFeatures": 4500 [ 4501 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4502 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4503 ] 4504 - "hpo" (list) 4505 List of HPO ids as phenotypic features. 4506 Example: 4507 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4508 Default: [] 4509 - "outputOptions" (dict): 4510 Output options (see Exomiser docs). 4511 Default: 4512 "output_options" = 4513 { 4514 "outputContributingVariantsOnly": False, 4515 "numGenes": 0, 4516 "outputFormats": ["TSV_VARIANT", "VCF"] 4517 } 4518 - "transcript_source" (string): 4519 Transcript source (either "refseq", "ucsc", "ensembl") 4520 Default: "refseq" 4521 - "exomiser_to_info" (boolean): 4522 Add exomiser TSV file columns as INFO fields in VCF. 4523 Default: False 4524 - "release" (string): 4525 Exomiser database release. 4526 If it does not exist, the database release will be downloaded (takes a while). 4527 Default: None (provided by application.properties configuration file) 4528 - "exomiser_application_properties" (file): 4529 Exomiser configuration file (see Exomiser docs). 4530 Useful to automatically download databases (especially for specific genome databases). 4531 4532 Notes: 4533 - If no sample in parameters, the first sample in the VCF will be chosen 4534 - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off 4535 4536 :param threads: The number of threads to use 4537 :return: None. 4538 """ 4539 4540 # DEBUG 4541 log.debug("Start annotation with Exomiser databases") 4542 4543 # Threads 4544 if not threads: 4545 threads = self.get_threads() 4546 log.debug("Threads: " + str(threads)) 4547 4548 # Config 4549 config = self.get_config() 4550 log.debug("Config: " + str(config)) 4551 4552 # Config - Folders - Databases 4553 databases_folders = ( 4554 config.get("folders", {}) 4555 .get("databases", {}) 4556 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4557 ) 4558 databases_folders = full_path(databases_folders) 4559 if not os.path.exists(databases_folders): 4560 log.error(f"Databases annotations: {databases_folders} NOT found") 4561 log.debug("Databases annotations: " + str(databases_folders)) 4562 4563 # Config - Exomiser 4564 exomiser_bin_command = get_bin_command( 4565 bin="exomiser-cli*.jar", 4566 tool="exomiser", 4567 bin_type="jar", 4568 config=config, 4569 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4570 ) 4571 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4572 if not exomiser_bin_command: 4573 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4574 log.error(msg_err) 4575 raise ValueError(msg_err) 4576 4577 # Param 4578 param = self.get_param() 4579 log.debug("Param: " + str(param)) 4580 4581 # Param - Exomiser 4582 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4583 log.debug(f"Param Exomiser: {param_exomiser}") 4584 4585 # Param - Assembly 4586 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4587 log.debug("Assembly: " + str(assembly)) 4588 4589 # Data 4590 table_variants = self.get_table_variants() 4591 4592 # Check if not empty 4593 log.debug("Check if not empty") 4594 sql_query_chromosomes = ( 4595 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4596 ) 4597 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4598 log.info(f"VCF empty") 4599 return False 4600 4601 # VCF header 4602 vcf_reader = self.get_header() 4603 log.debug("Initial header: " + str(vcf_reader.infos)) 4604 4605 # Samples
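# --- Illustrative parameter sketch (hypothetical values) for the
# "annotation" -> "exomiser" section read above, combining keys documented
# in the docstring:
# param = {
#     "annotation": {
#         "exomiser": {
#             "preset": "exome",
#             "sample": "ISDBM322017",
#             "hpo": ["0001156", "0001363"],
#             "exomiser_to_info": True,
#         }
#     }
# }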
4606 samples = self.get_header_sample_list() 4607 if not samples: 4608 log.error("No Samples in VCF") 4609 return False 4610 log.debug(f"Samples: {samples}") 4611 4612 # Memory limit 4613 memory_limit = self.get_memory("8G") 4614 log.debug(f"memory_limit: {memory_limit}") 4615 4616 # Exomiser java options 4617 exomiser_java_options = ( 4618 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4619 ) 4620 log.debug(f"Exomiser java options: {exomiser_java_options}") 4621 4622 # Download Exomiser (if not exists) 4623 exomiser_release = param_exomiser.get("release", None) 4624 exomiser_application_properties = param_exomiser.get( 4625 "exomiser_application_properties", None 4626 ) 4627 databases_download_exomiser( 4628 assemblies=[assembly], 4629 exomiser_folder=databases_folders, 4630 exomiser_release=exomiser_release, 4631 exomiser_phenotype_release=exomiser_release, 4632 exomiser_application_properties=exomiser_application_properties, 4633 ) 4634 4635 # Force annotation 4636 force_update_annotation = True 4637 4638 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4639 log.debug("Start annotation Exomiser") 4640 4641 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4642 4643 4644 4645 ### ANALYSIS ### 4646 ################ 4647 4648 # Create analysis.json through analysis dict 4649 # either analysis in param or by default 4650 # (depending on preset exome/genome) 4651 4652 # Init analysis dict 4653 param_exomiser_analysis_dict = {} 4654 4655 # analysis from param 4656 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4657 param_exomiser_analysis = full_path(param_exomiser_analysis) 4658 4659 # If analysis in param -> load analysis JSON 4660 if param_exomiser_analysis: 4661 4662 # If param analysis is a file and exists 4663 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4664 param_exomiser_analysis 4665 ): 4666 # Load analysis file into analysis dict (either yaml or json) 4667 with open(param_exomiser_analysis) as json_file: 4668 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4669 4670 # If param analysis is a dict 4671 elif isinstance(param_exomiser_analysis, dict): 4672 # Load analysis dict into analysis dict (either yaml or json) 4673 param_exomiser_analysis_dict = param_exomiser_analysis 4674 4675 # Error analysis type 4676 else: 4677 log.error(f"Analysis type unknown. Check param file.") 4678 raise ValueError(f"Analysis type unknown. Check param file.")
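# --- Note: JSON is, for practical purposes, a subset of YAML, which is why the
# single yaml.safe_load() call above accepts analysis files in either format.
# Minimal sketch (hypothetical key):
# >>> import yaml
# >>> yaml.safe_load('{"analysisMode": "PASS_ONLY"}')
# {'analysisMode': 'PASS_ONLY'}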
4679 4680 # Case no input analysis config file/dict 4681 # Use preset (exome/genome) to open default config file 4682 if not param_exomiser_analysis_dict: 4683 4684 # default preset 4685 default_preset = "exome" 4686 4687 # Get param preset or default preset 4688 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4689 4690 # Try to find if preset is a file 4691 if os.path.exists(param_exomiser_preset): 4692 # Preset file is provided in full path 4693 param_exomiser_analysis_default_config_file = ( 4694 param_exomiser_preset 4695 ) 4696 # elif os.path.exists(full_path(param_exomiser_preset)): 4697 #     # Preset file is provided in full path 4698 #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4699 elif os.path.exists( 4700 os.path.join(folder_config, param_exomiser_preset) 4701 ): 4702 # Preset file is provided as a basename in the config folder (can be a path with subfolders) 4703 param_exomiser_analysis_default_config_file = os.path.join( 4704 folder_config, param_exomiser_preset 4705 ) 4706 else: 4707 # Construct preset file 4708 param_exomiser_analysis_default_config_file = os.path.join( 4709 folder_config, 4710 f"preset-{param_exomiser_preset}-analysis.json", 4711 ) 4712 4713 # If preset file exists 4714 param_exomiser_analysis_default_config_file = full_path( 4715 param_exomiser_analysis_default_config_file 4716 ) 4717 if os.path.exists(param_exomiser_analysis_default_config_file): 4718 # Load preset file into analysis dict (either yaml or json) 4719 with open( 4720 param_exomiser_analysis_default_config_file 4721 ) as json_file: 4722 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4723 json_file 4724 ) 4725 4726 # Error preset file 4727 else: 4728 log.error( 4729 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4730 ) 4731 raise ValueError( 4732 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4733 ) 4734 4735 # If no analysis dict created 4736 if not param_exomiser_analysis_dict: 4737 log.error(f"No analysis config") 4738 raise ValueError(f"No analysis config") 4739 4740 # Log 4741 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4742 4743 ### PHENOPACKET ### 4744 ################### 4745 4746 # If no PhenoPacket in analysis dict -> check in param 4747 if "phenopacket" not in param_exomiser_analysis_dict: 4748 4749 # If PhenoPacket in param -> load phenopacket JSON 4750 if param_exomiser.get("phenopacket", None): 4751 4752 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4753 param_exomiser_phenopacket = full_path( 4754 param_exomiser_phenopacket 4755 ) 4756 4757 # If param phenopacket is a file and exists 4758 if isinstance( 4759 param_exomiser_phenopacket, str 4760 ) and os.path.exists(param_exomiser_phenopacket): 4761 # Load phenopacket file into analysis dict (either yaml or json) 4762 with open(param_exomiser_phenopacket) as json_file: 4763 param_exomiser_analysis_dict["phenopacket"] = ( 4764 yaml.safe_load(json_file) 4765 ) 4766 4767 # If param phenopacket is a dict 4768 elif isinstance(param_exomiser_phenopacket, dict): 4769 # Load phenopacket dict into analysis dict (either yaml or json) 4770 param_exomiser_analysis_dict["phenopacket"] = ( 4771 param_exomiser_phenopacket 4772 ) 4773 4774 # Error phenopacket type 4775 else: 4776 log.error(f"Phenopacket type unknown. Check param file.") 4777 raise ValueError( 4778 f"Phenopacket type unknown. Check param file." 4779 )
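# --- Illustrative sketch (hypothetical sample and HPO values) of the minimal
# phenopacket dict assembled by the fallback below:
# {
#     "id": "analysis",
#     "proband": {},
#     "subject": {"id": "SampleA", "sex": "UNKNOWN_SEX"},
#     "phenotypicFeatures": [
#         {"type": {"id": "HP:0001156", "label": "HP:0001156"}}
#     ],
# }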
4780 4781 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4782 if "phenopacket" not in param_exomiser_analysis_dict: 4783 4784 # Init PhenoPacket 4785 param_exomiser_analysis_dict["phenopacket"] = { 4786 "id": "analysis", 4787 "proband": {}, 4788 } 4789 4790 ### Add subject ### 4791 4792 # If subject exists 4793 param_exomiser_subject = param_exomiser.get("subject", {}) 4794 4795 # If subject does not exist -> find sample ID 4796 if not param_exomiser_subject: 4797 4798 # Find sample ID in param 4799 sample = param_exomiser.get("sample", None) 4800 4801 # Find sample ID (first sample) 4802 if not sample: 4803 sample_list = self.get_header_sample_list() 4804 if len(sample_list) > 0: 4805 sample = sample_list[0] 4806 else: 4807 log.error(f"No sample found") 4808 raise ValueError(f"No sample found") 4809 4810 # Create subject 4811 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4812 4813 # Add to dict 4814 param_exomiser_analysis_dict["phenopacket"][ 4815 "subject" 4816 ] = param_exomiser_subject 4817 4818 ### Add "phenotypicFeatures" ### 4819 4820 # If phenotypicFeatures exists 4821 param_exomiser_phenotypicfeatures = param_exomiser.get( 4822 "phenotypicFeatures", [] 4823 ) 4824 4825 # If phenotypicFeatures does not exist -> try to infer from hpo list 4826 if not param_exomiser_phenotypicfeatures: 4827 4828 # Find HPO in param 4829 param_exomiser_hpo = param_exomiser.get("hpo", []) 4830 4831 # Split HPO if list in string format separated by comma 4832 if isinstance(param_exomiser_hpo, str): 4833 param_exomiser_hpo = param_exomiser_hpo.split(",") 4834 4835 # Create HPO list 4836 for hpo in param_exomiser_hpo: 4837 hpo_clean = re.sub("[^0-9]", "", hpo) 4838 param_exomiser_phenotypicfeatures.append( 4839 { 4840 "type": { 4841 "id": f"HP:{hpo_clean}", 4842 "label": f"HP:{hpo_clean}", 4843 } 4844 } 4845 ) 4846 4847 # Add to dict 4848 param_exomiser_analysis_dict["phenopacket"][ 4849 "phenotypicFeatures" 4850 ] = param_exomiser_phenotypicfeatures 4851 4852 # If still no phenotypicFeatures -> remove hiPhivePrioritiser step 4853 if not param_exomiser_phenotypicfeatures: 4854 for step in param_exomiser_analysis_dict.get( 4855 "analysis", {} 4856 ).get("steps", []): 4857 if "hiPhivePrioritiser" in step: 4858 param_exomiser_analysis_dict.get("analysis", {}).get( 4859 "steps", [] 4860 ).remove(step) 4861 4862 ### Add Input File ### 4863 4864 # Initial file name and htsFiles 4865 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4866 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4867 { 4868 "uri": tmp_vcf_name, 4869 "htsFormat": "VCF", 4870 "genomeAssembly": assembly, 4871 } 4872 ] 4873 4874 ### Add metaData ### 4875 4876 # If metaData not in analysis dict 4877 if "metaData" not in param_exomiser_analysis_dict: 4878 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4879 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4880 "createdBy": "howard", 4881 "phenopacketSchemaVersion": 1, 4882 } 4883 4884 ### OutputOptions ### 4885 4886 # Init output result folder 4887 output_results = os.path.join(tmp_dir, "results") 4888 4889 # If no outputOptions in analysis dict 4890 if "outputOptions" not in param_exomiser_analysis_dict: 4891 4892 # default output formats 4893 default_output_formats = ["TSV_VARIANT", "VCF"] 4894 4895 # Get outputOptions in param 4896 output_options = param_exomiser.get("outputOptions", None) 4897 4898 # If no output_options in param -> use defaults 4899 if not output_options: 4900 output_options = {
"outputContributingVariantsOnly": False, 4902 "numGenes": 0, 4903 "outputFormats": defaut_output_formats, 4904 } 4905 4906 # Replace outputDirectory in output options 4907 output_options["outputDirectory"] = output_results 4908 output_options["outputFileName"] = "howard" 4909 4910 # Add outputOptions in analysis dict 4911 param_exomiser_analysis_dict["outputOptions"] = output_options 4912 4913 else: 4914 4915 # Replace output_results and output format (if exists in param) 4916 param_exomiser_analysis_dict["outputOptions"][ 4917 "outputDirectory" 4918 ] = output_results 4919 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4920 list( 4921 set( 4922 param_exomiser_analysis_dict.get( 4923 "outputOptions", {} 4924 ).get("outputFormats", []) 4925 + ["TSV_VARIANT", "VCF"] 4926 ) 4927 ) 4928 ) 4929 4930 # log 4931 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4932 4933 ### ANALYSIS FILE ### 4934 ##################### 4935 4936 ### Full JSON analysis config file ### 4937 4938 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4939 with open(exomiser_analysis, "w") as fp: 4940 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4941 4942 ### SPLIT analysis and sample config files 4943 4944 # Splitted analysis dict 4945 param_exomiser_analysis_dict_for_split = ( 4946 param_exomiser_analysis_dict.copy() 4947 ) 4948 4949 # Phenopacket JSON file 4950 exomiser_analysis_phenopacket = os.path.join( 4951 tmp_dir, "analysis_phenopacket.json" 4952 ) 4953 with open(exomiser_analysis_phenopacket, "w") as fp: 4954 json.dump( 4955 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4956 fp, 4957 indent=4, 4958 ) 4959 4960 # Analysis JSON file without Phenopacket parameters 4961 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4962 exomiser_analysis_analysis = os.path.join( 4963 tmp_dir, "analysis_analysis.json" 4964 ) 4965 with open(exomiser_analysis_analysis, "w") as fp: 4966 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4967 4968 ### INITAL VCF file ### 4969 ####################### 4970 4971 ### Create list of samples to use and include inti initial VCF file #### 4972 4973 # Subject (main sample) 4974 # Get sample ID in analysis dict 4975 sample_subject = ( 4976 param_exomiser_analysis_dict.get("phenopacket", {}) 4977 .get("subject", {}) 4978 .get("id", None) 4979 ) 4980 sample_proband = ( 4981 param_exomiser_analysis_dict.get("phenopacket", {}) 4982 .get("proband", {}) 4983 .get("subject", {}) 4984 .get("id", None) 4985 ) 4986 sample = [] 4987 if sample_subject: 4988 sample.append(sample_subject) 4989 if sample_proband: 4990 sample.append(sample_proband) 4991 4992 # Get sample ID within Pedigree 4993 pedigree_persons_list = ( 4994 param_exomiser_analysis_dict.get("phenopacket", {}) 4995 .get("pedigree", {}) 4996 .get("persons", {}) 4997 ) 4998 4999 # Create list with all sample ID in pedigree (if exists) 5000 pedigree_persons = [] 5001 for person in pedigree_persons_list: 5002 pedigree_persons.append(person.get("individualId")) 5003 5004 # Concat subject sample ID and samples ID in pedigreesamples 5005 samples = list(set(sample + pedigree_persons)) 5006 5007 # Check if sample list is not empty 5008 if not samples: 5009 log.error(f"No samples found") 5010 raise ValueError(f"No samples found") 5011 5012 # Create VCF with sample (either sample in param or first one by default) 5013 # Export VCF file 5014 self.export_variant_vcf( 5015 vcf_file=tmp_vcf_name, 5016 remove_info=True, 5017 add_samples=True, 5018 list_samples=samples, 5019 
index=False, 5020 ) 5021 5022 ### Execute Exomiser ### 5023 ######################## 5024 5025 # Init command 5026 exomiser_command = "" 5027 5028 # Command exomiser options 5029 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5030 5031 # Release 5032 exomiser_release = param_exomiser.get("release", None) 5033 if exomiser_release: 5034 # phenotype data version 5035 exomiser_options += ( 5036 f" --exomiser.phenotype.data-version={exomiser_release} " 5037 ) 5038 # data version 5039 exomiser_options += ( 5040 f" --exomiser.{assembly}.data-version={exomiser_release} " 5041 ) 5042 # variant white list 5043 variant_white_list_file = ( 5044 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5045 ) 5046 if os.path.exists( 5047 os.path.join( 5048 databases_folders, assembly, variant_white_list_file 5049 ) 5050 ): 5051 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5052 5053 # transcript_source 5054 transcript_source = param_exomiser.get( 5055 "transcript_source", None 5056 ) # ucsc, refseq, ensembl 5057 if transcript_source: 5058 exomiser_options += ( 5059 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5060 ) 5061 5062 # If analysis contain proband param 5063 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5064 "proband", {} 5065 ): 5066 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5067 5068 # If no proband (usually uniq sample) 5069 else: 5070 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5071 5072 # Log 5073 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5074 5075 # Run command 5076 result = subprocess.call( 5077 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5078 ) 5079 if result: 5080 log.error("Exomiser command failed") 5081 raise ValueError("Exomiser command failed") 5082 5083 ### RESULTS ### 5084 ############### 5085 5086 ### Annotate with TSV fields ### 5087 5088 # Init result tsv file 5089 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5090 5091 # Init result tsv file 5092 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5093 5094 # Parse TSV file and explode columns in INFO field 5095 if exomiser_to_info and os.path.exists(output_results_tsv): 5096 5097 # Log 5098 log.debug("Exomiser columns to VCF INFO field") 5099 5100 # Retrieve columns and types 5101 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5102 output_results_tsv_df = self.get_query_to_df(query) 5103 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5104 5105 # Init concat fields for update 5106 sql_query_update_concat_fields = [] 5107 5108 # Fields to avoid 5109 fields_to_avoid = [ 5110 "CONTIG", 5111 "START", 5112 "END", 5113 "REF", 5114 "ALT", 5115 "QUAL", 5116 "FILTER", 5117 "GENOTYPE", 5118 ] 5119 5120 # List all columns to add into header 5121 for header_column in output_results_tsv_columns: 5122 5123 # If header column is enable 5124 if header_column not in fields_to_avoid: 5125 5126 # Header info type 5127 header_info_type = "String" 5128 header_column_df = output_results_tsv_df[header_column] 5129 header_column_df_dtype = header_column_df.dtype 5130 if header_column_df_dtype == object: 5131 if ( 
5132 pd.to_numeric(header_column_df, errors="coerce") 5133 .notnull() 5134 .all() 5135 ): 5136 header_info_type = "Float" 5137 else: 5138 header_info_type = "Integer" 5139 5140 # Header info 5141 characters_to_validate = ["-"] 5142 pattern = "[" + "".join(characters_to_validate) + "]" 5143 header_info_name = re.sub( 5144 pattern, 5145 "_", 5146 f"Exomiser_{header_column}".replace("#", ""), 5147 ) 5148 header_info_number = "." 5149 header_info_description = ( 5150 f"Exomiser {header_column} annotation" 5151 ) 5152 header_info_source = "Exomiser" 5153 header_info_version = "unknown" 5154 header_info_code = CODE_TYPE_MAP[header_info_type] 5155 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5156 header_info_name, 5157 header_info_number, 5158 header_info_type, 5159 header_info_description, 5160 header_info_source, 5161 header_info_version, 5162 header_info_code, 5163 ) 5164 5165 # Add field to the concat fields for the update query 5166 sql_query_update_concat_fields.append( 5167 f""" 5168 CASE 5169 WHEN table_parquet."{header_column}" NOT IN ('','.') 5170 THEN concat( 5171 '{header_info_name}=', 5172 table_parquet."{header_column}", 5173 ';' 5174 ) 5175 5176 ELSE '' 5177 END 5178 """ 5179 ) 5180 5181 # Update query 5182 sql_query_update = f""" 5183 UPDATE {table_variants} as table_variants 5184 SET INFO = concat( 5185 CASE 5186 WHEN INFO NOT IN ('', '.') 5187 THEN INFO 5188 ELSE '' 5189 END, 5190 CASE 5191 WHEN table_variants.INFO NOT IN ('','.') 5192 THEN ';' 5193 ELSE '' 5194 END, 5195 ( 5196 SELECT 5197 concat( 5198 {",".join(sql_query_update_concat_fields)} 5199 ) 5200 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5201 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5202 AND table_parquet.\"START\" = table_variants.\"POS\" 5203 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5204 AND table_parquet.\"REF\" = table_variants.\"REF\" 5205 ) 5206 ) 5207 ; 5208 """ 5209 5210 # Update 5211 self.conn.execute(sql_query_update) 5212 5213 ### Annotate with VCF INFO field ### 5214 5215 # Init result VCF file 5216 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5217 5218 # If VCF exists 5219 if os.path.exists(output_results_vcf): 5220 5221 # Log 5222 log.debug("Exomiser result VCF update variants") 5223 5224 # Find Exomiser INFO field annotation in header 5225 with gzip.open(output_results_vcf, "rt") as f: 5226 header_list = self.read_vcf_header(f) 5227 exomiser_vcf_header = vcf.Reader( 5228 io.StringIO("\n".join(header_list)) 5229 ) 5230 5231 # Add annotation INFO field to header 5232 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5233 5234 # Update variants with VCF 5235 self.update_from_vcf(output_results_vcf) 5236 5237 return True 5238 5239 def annotation_snpeff(self, threads: int = None) -> None: 5240 """ 5241 This function annotates with snpEff 5242 5243 :param threads: The number of threads to use 5244 :return: None.
5245 """ 5246 5247 # DEBUG 5248 log.debug("Start annotation with snpeff databases") 5249 5250 # Threads 5251 if not threads: 5252 threads = self.get_threads() 5253 log.debug("Threads: " + str(threads)) 5254 5255 # DEBUG 5256 delete_tmp = True 5257 if self.get_config().get("verbosity", "warning") in ["debug"]: 5258 delete_tmp = False 5259 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5260 5261 # Config 5262 config = self.get_config() 5263 log.debug("Config: " + str(config)) 5264 5265 # Config - Folders - Databases 5266 databases_folders = ( 5267 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5268 ) 5269 log.debug("Databases annotations: " + str(databases_folders)) 5270 5271 # Config - snpEff bin command 5272 snpeff_bin_command = get_bin_command( 5273 bin="snpEff.jar", 5274 tool="snpeff", 5275 bin_type="jar", 5276 config=config, 5277 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5278 ) 5279 if not snpeff_bin_command: 5280 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5281 log.error(msg_err) 5282 raise ValueError(msg_err) 5283 5284 # Config - snpEff databases 5285 snpeff_databases = ( 5286 config.get("folders", {}) 5287 .get("databases", {}) 5288 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5289 ) 5290 snpeff_databases = full_path(snpeff_databases) 5291 if snpeff_databases is not None and snpeff_databases != "": 5292 log.debug(f"Create snpEff databases folder") 5293 if not os.path.exists(snpeff_databases): 5294 os.makedirs(snpeff_databases) 5295 5296 # Param 5297 param = self.get_param() 5298 log.debug("Param: " + str(param)) 5299 5300 # Param 5301 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5302 log.debug("Options: " + str(options)) 5303 5304 # Param - Assembly 5305 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5306 5307 # Param - Options 5308 snpeff_options = ( 5309 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5310 ) 5311 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5312 snpeff_csvstats = ( 5313 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5314 ) 5315 if snpeff_stats: 5316 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5317 snpeff_stats = full_path(snpeff_stats) 5318 snpeff_options += f" -stats {snpeff_stats}" 5319 if snpeff_csvstats: 5320 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5321 snpeff_csvstats = full_path(snpeff_csvstats) 5322 snpeff_options += f" -csvStats {snpeff_csvstats}" 5323 5324 # Data 5325 table_variants = self.get_table_variants() 5326 5327 # Check if not empty 5328 log.debug("Check if not empty") 5329 sql_query_chromosomes = ( 5330 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5331 ) 5332 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5333 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5334 log.info(f"VCF empty") 5335 return 5336 5337 # Export in VCF 5338 log.debug("Create initial file to annotate") 5339 tmp_vcf = NamedTemporaryFile( 5340 prefix=self.get_prefix(), 5341 dir=self.get_tmp_dir(), 5342 suffix=".vcf.gz", 5343 delete=True, 5344 ) 5345 tmp_vcf_name = tmp_vcf.name 5346 5347 # VCF header 5348 vcf_reader = self.get_header() 5349 log.debug("Initial header: " + str(vcf_reader.infos)) 5350 5351 # Existing annotations 5352 for vcf_annotation in self.get_header().infos: 5353 5354 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5355 log.debug( 5356 
f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5357 ) 5358 5359 # Memory limit 5360 # if config.get("memory", None): 5361 # memory_limit = config.get("memory", "8G") 5362 # else: 5363 # memory_limit = "8G" 5364 memory_limit = self.get_memory("8G") 5365 log.debug(f"memory_limit: {memory_limit}") 5366 5367 # snpEff java options 5368 snpeff_java_options = ( 5369 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5370 ) 5371 log.debug(f"Exomiser java options: {snpeff_java_options}") 5372 5373 force_update_annotation = True 5374 5375 if "ANN" not in self.get_header().infos or force_update_annotation: 5376 5377 # Check snpEff database 5378 log.debug(f"Check snpEff databases {[assembly]}") 5379 databases_download_snpeff( 5380 folder=snpeff_databases, assemblies=[assembly], config=config 5381 ) 5382 5383 # Export VCF file 5384 self.export_variant_vcf( 5385 vcf_file=tmp_vcf_name, 5386 remove_info=True, 5387 add_samples=False, 5388 index=True, 5389 ) 5390 5391 # Tmp file 5392 err_files = [] 5393 tmp_annotate_vcf = NamedTemporaryFile( 5394 prefix=self.get_prefix(), 5395 dir=self.get_tmp_dir(), 5396 suffix=".vcf", 5397 delete=False, 5398 ) 5399 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5400 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5401 err_files.append(tmp_annotate_vcf_name_err) 5402 5403 # Command 5404 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5405 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5406 run_parallel_commands([snpeff_command], 1) 5407 5408 # Error messages 5409 log.info(f"Error/Warning messages:") 5410 error_message_command_all = [] 5411 error_message_command_warning = [] 5412 error_message_command_err = [] 5413 for err_file in err_files: 5414 with open(err_file, "r") as f: 5415 for line in f: 5416 message = line.strip() 5417 error_message_command_all.append(message) 5418 if line.startswith("[W::"): 5419 error_message_command_warning.append(message) 5420 if line.startswith("[E::"): 5421 error_message_command_err.append(f"{err_file}: " + message) 5422 # log info 5423 for message in list( 5424 set(error_message_command_err + error_message_command_warning) 5425 ): 5426 log.info(f" {message}") 5427 # debug info 5428 for message in list(set(error_message_command_all)): 5429 log.debug(f" {message}") 5430 # failed 5431 if len(error_message_command_err): 5432 log.error("Annotation failed: Error in commands") 5433 raise ValueError("Annotation failed: Error in commands") 5434 5435 # Find annotation in header 5436 with open(tmp_annotate_vcf_name, "rt") as f: 5437 header_list = self.read_vcf_header(f) 5438 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5439 5440 for ann in annovar_vcf_header.infos: 5441 if ann not in self.get_header().infos: 5442 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5443 5444 # Update variants 5445 log.info(f"Annotation - Updating...") 5446 self.update_from_vcf(tmp_annotate_vcf_name) 5447 5448 else: 5449 if "ANN" in self.get_header().infos: 5450 log.debug(f"Existing snpEff annotations in VCF") 5451 if force_update_annotation: 5452 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5453 5454 def annotation_annovar(self, threads: int = None) -> None: 5455 """ 5456 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5457 annotations 5458 5459 :param threads: number of threads to 
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        :param threads: number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(
                    f"Annovar databases folder list provided, first folder '{annovar_databases}' selected"
                )
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = "Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not
sql_query_chromosomes_df["count"][0]: 5570 log.info(f"VCF empty") 5571 return 5572 5573 # VCF header 5574 vcf_reader = self.get_header() 5575 log.debug("Initial header: " + str(vcf_reader.infos)) 5576 5577 # Existing annotations 5578 for vcf_annotation in self.get_header().infos: 5579 5580 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5581 log.debug( 5582 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5583 ) 5584 5585 force_update_annotation = True 5586 5587 if annotations: 5588 5589 commands = [] 5590 tmp_annotates_vcf_name_list = [] 5591 5592 # Export in VCF 5593 log.debug("Create initial file to annotate") 5594 tmp_vcf = NamedTemporaryFile( 5595 prefix=self.get_prefix(), 5596 dir=self.get_tmp_dir(), 5597 suffix=".vcf.gz", 5598 delete=False, 5599 ) 5600 tmp_vcf_name = tmp_vcf.name 5601 tmp_files.append(tmp_vcf_name) 5602 tmp_files.append(tmp_vcf_name + ".tbi") 5603 5604 # Export VCF file 5605 self.export_variant_vcf( 5606 vcf_file=tmp_vcf_name, 5607 remove_info=".", 5608 add_samples=False, 5609 index=True, 5610 ) 5611 5612 # Create file for field rename 5613 log.debug("Create file for field rename") 5614 tmp_rename = NamedTemporaryFile( 5615 prefix=self.get_prefix(), 5616 dir=self.get_tmp_dir(), 5617 suffix=".rename", 5618 delete=False, 5619 ) 5620 tmp_rename_name = tmp_rename.name 5621 tmp_files.append(tmp_rename_name) 5622 5623 # Check Annovar database 5624 log.debug( 5625 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5626 ) 5627 databases_download_annovar( 5628 folder=annovar_databases, 5629 files=list(annotations.keys()), 5630 assemblies=[assembly], 5631 ) 5632 5633 for annotation in annotations: 5634 annotation_fields = annotations[annotation] 5635 5636 if not annotation_fields: 5637 annotation_fields = {"INFO": None} 5638 5639 log.info(f"Annotations Annovar - database '{annotation}'") 5640 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5641 5642 # Tmp file for annovar 5643 err_files = [] 5644 tmp_annotate_vcf_directory = TemporaryDirectory( 5645 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5646 ) 5647 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5648 tmp_annotate_vcf_name_annovar = ( 5649 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5650 ) 5651 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5652 err_files.append(tmp_annotate_vcf_name_err) 5653 tmp_files.append(tmp_annotate_vcf_name_err) 5654 5655 # Tmp file final vcf annotated by annovar 5656 tmp_annotate_vcf = NamedTemporaryFile( 5657 prefix=self.get_prefix(), 5658 dir=self.get_tmp_dir(), 5659 suffix=".vcf.gz", 5660 delete=False, 5661 ) 5662 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5663 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5664 tmp_files.append(tmp_annotate_vcf_name) 5665 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5666 5667 # Number of fields 5668 annotation_list = [] 5669 annotation_renamed_list = [] 5670 5671 for annotation_field in annotation_fields: 5672 5673 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 5674 annotation_fields_new_name = annotation_fields.get( 5675 annotation_field, annotation_field 5676 ) 5677 if not annotation_fields_new_name: 5678 annotation_fields_new_name = annotation_field 5679 5680 if ( 5681 force_update_annotation 5682 or annotation_fields_new_name not in self.get_header().infos 5683 ): 5684 annotation_list.append(annotation_field) 5685 annotation_renamed_list.append(annotation_fields_new_name) 5686 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5687 log.warning( 5688 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5689 ) 5690 5691 # Add rename info 5692 run_parallel_commands( 5693 [ 5694 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5695 ], 5696 1, 5697 ) 5698 5699 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5700 log.debug("annotation_list: " + str(annotation_list)) 5701 5702 # protocol 5703 protocol = annotation 5704 5705 # argument 5706 argument = "" 5707 5708 # operation 5709 operation = "f" 5710 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5711 "ensGene" 5712 ): 5713 operation = "g" 5714 if options.get("genebase", None): 5715 argument = f"""'{options.get("genebase","")}'""" 5716 elif annotation in ["cytoBand"]: 5717 operation = "r" 5718 5719 # argument option 5720 argument_option = "" 5721 if argument != "": 5722 argument_option = " --argument " + argument 5723 5724 # command options 5725 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5726 for option in options: 5727 if option not in ["genebase"]: 5728 command_options += f""" --{option}={options[option]}""" 5729 5730 # Command 5731 5732 # Command - Annovar 5733 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5734 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5735 5736 # Command - start pipe 5737 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5738 5739 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
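                # e.g. strips 'ANNOVAR_DATE=2020-06-08;' so that
                # 'ANNOVAR_DATE=2020-06-08;Func.refGene=exonic;...' becomes
                # 'Func.refGene=exonic;...' (illustrative values)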
5740 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5741 5742 # Command - Special characters (refGene annotation) 5743 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5744 5745 # Command - Clean empty fields (with value ".") 5746 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5747 5748 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5749 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5750 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5751 # for ann in annotation_renamed_list: 5752 for ann in annotation_list: 5753 annovar_fields_to_keep.append(f"^INFO/{ann}") 5754 5755 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5756 5757 # Command - indexing 5758 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5759 5760 log.debug(f"Annotation - Annovar command: {command_annovar}") 5761 run_parallel_commands([command_annovar], 1) 5762 5763 # Error messages 5764 log.info(f"Error/Warning messages:") 5765 error_message_command_all = [] 5766 error_message_command_warning = [] 5767 error_message_command_err = [] 5768 for err_file in err_files: 5769 with open(err_file, "r") as f: 5770 for line in f: 5771 message = line.strip() 5772 error_message_command_all.append(message) 5773 if line.startswith("[W::") or line.startswith("WARNING"): 5774 error_message_command_warning.append(message) 5775 if line.startswith("[E::") or line.startswith("ERROR"): 5776 error_message_command_err.append( 5777 f"{err_file}: " + message 5778 ) 5779 # log info 5780 for message in list( 5781 set(error_message_command_err + error_message_command_warning) 5782 ): 5783 log.info(f" {message}") 5784 # debug info 5785 for message in list(set(error_message_command_all)): 5786 log.debug(f" {message}") 5787 # failed 5788 if len(error_message_command_err): 5789 log.error("Annotation failed: Error in commands") 5790 raise ValueError("Annotation failed: Error in commands") 5791 5792 if tmp_annotates_vcf_name_list: 5793 5794 # List of annotated files 5795 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5796 5797 # Tmp file 5798 tmp_annotate_vcf = NamedTemporaryFile( 5799 prefix=self.get_prefix(), 5800 dir=self.get_tmp_dir(), 5801 suffix=".vcf.gz", 5802 delete=False, 5803 ) 5804 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5805 tmp_files.append(tmp_annotate_vcf_name) 5806 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5807 err_files.append(tmp_annotate_vcf_name_err) 5808 tmp_files.append(tmp_annotate_vcf_name_err) 5809 5810 # Command merge 5811 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5812 log.info( 5813 f"Annotation Annovar - Annotation merging " 5814 + str(len(tmp_annotates_vcf_name_list)) 5815 + " annotated files" 5816 ) 5817 log.debug(f"Annotation - merge command: {merge_command}") 5818 run_parallel_commands([merge_command], 
1)

                # Find annotations in the merged header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info("Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean tmp files
        tmp_files_remove_command = ""
        if tmp_files:
            tmp_files_remove_command = " ".join(tmp_files)
        clean_command = f" rm -f {tmp_files_remove_command} "
        log.debug("Annotation Annovar - Annotation cleaning")
        log.debug(f"Annotation - cleaning command: {clean_command}")
        run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        It takes a VCF file and annotates it with a parquet file

        :param threads: number of threads to use for the annotation
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info("VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Number of variants
        nb_variants = self.conn.execute(
            "SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug(f"Number of variants: {nb_variants}")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line =
self.get_header().infos.get(vcf_annotation) 5937 log.debug( 5938 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5939 ) 5940 5941 # Added columns 5942 added_columns = [] 5943 5944 # drop indexes 5945 log.debug(f"Drop indexes...") 5946 self.drop_indexes() 5947 5948 if annotations: 5949 5950 if "ALL" in annotations: 5951 5952 all_param = annotations.get("ALL", {}) 5953 all_param_formats = all_param.get("formats", None) 5954 all_param_releases = all_param.get("releases", None) 5955 5956 databases_infos_dict = self.scan_databases( 5957 database_formats=all_param_formats, 5958 database_releases=all_param_releases, 5959 ) 5960 for database_infos in databases_infos_dict.keys(): 5961 if database_infos not in annotations: 5962 annotations[database_infos] = {"INFO": None} 5963 5964 for annotation in annotations: 5965 5966 if annotation in ["ALL"]: 5967 continue 5968 5969 # Annotation Name 5970 annotation_name = os.path.basename(annotation) 5971 5972 # Annotation fields 5973 annotation_fields = annotations[annotation] 5974 if not annotation_fields: 5975 annotation_fields = {"INFO": None} 5976 5977 log.debug(f"Annotation '{annotation_name}'") 5978 log.debug( 5979 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5980 ) 5981 5982 # Create Database 5983 database = Database( 5984 database=annotation, 5985 databases_folders=databases_folders, 5986 assembly=assembly, 5987 ) 5988 5989 # Find files 5990 parquet_file = database.get_database() 5991 parquet_hdr_file = database.get_header_file() 5992 parquet_type = database.get_type() 5993 5994 # Check if files exists 5995 if not parquet_file or not parquet_hdr_file: 5996 msg_err_list = [] 5997 if not parquet_file: 5998 msg_err_list.append( 5999 f"Annotation failed: Annotation file not found" 6000 ) 6001 if parquet_file and not parquet_hdr_file: 6002 msg_err_list.append( 6003 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6004 ) 6005 6006 log.error(". ".join(msg_err_list)) 6007 raise ValueError(". 
".join(msg_err_list)) 6008 else: 6009 # Get parquet connexion 6010 parquet_sql_attach = database.get_sql_database_attach( 6011 output="query" 6012 ) 6013 if parquet_sql_attach: 6014 self.conn.execute(parquet_sql_attach) 6015 parquet_file_link = database.get_sql_database_link() 6016 # Log 6017 log.debug( 6018 f"Annotation '{annotation_name}' - file: " 6019 + str(parquet_file) 6020 + " and " 6021 + str(parquet_hdr_file) 6022 ) 6023 6024 # Database full header columns 6025 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6026 parquet_hdr_file 6027 ) 6028 # Log 6029 log.debug( 6030 "Annotation database header columns : " 6031 + str(parquet_hdr_vcf_header_columns) 6032 ) 6033 6034 # Load header as VCF object 6035 parquet_hdr_vcf_header_infos = database.get_header().infos 6036 # Log 6037 log.debug( 6038 "Annotation database header: " 6039 + str(parquet_hdr_vcf_header_infos) 6040 ) 6041 6042 # Get extra infos 6043 parquet_columns = database.get_extra_columns() 6044 # Log 6045 log.debug("Annotation database Columns: " + str(parquet_columns)) 6046 6047 # Add extra columns if "ALL" in annotation_fields 6048 # if "ALL" in annotation_fields: 6049 # allow_add_extra_column = True 6050 if "ALL" in annotation_fields and database.get_extra_columns(): 6051 for extra_column in database.get_extra_columns(): 6052 if ( 6053 extra_column not in annotation_fields 6054 and extra_column.replace("INFO/", "") 6055 not in parquet_hdr_vcf_header_infos 6056 ): 6057 parquet_hdr_vcf_header_infos[extra_column] = ( 6058 vcf.parser._Info( 6059 extra_column, 6060 ".", 6061 "String", 6062 f"{extra_column} description", 6063 "unknown", 6064 "unknown", 6065 self.code_type_map["String"], 6066 ) 6067 ) 6068 6069 # For all fields in database 6070 annotation_fields_all = False 6071 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6072 annotation_fields_all = True 6073 annotation_fields = { 6074 key: key for key in parquet_hdr_vcf_header_infos 6075 } 6076 6077 log.debug( 6078 "Annotation database header - All annotations added: " 6079 + str(annotation_fields) 6080 ) 6081 6082 # Init 6083 6084 # List of annotation fields to use 6085 sql_query_annotation_update_info_sets = [] 6086 6087 # List of annotation to agregate 6088 sql_query_annotation_to_agregate = [] 6089 6090 # Number of fields 6091 nb_annotation_field = 0 6092 6093 # Annotation fields processed 6094 annotation_fields_processed = [] 6095 6096 # Columns mapping 6097 map_columns = database.map_columns( 6098 columns=annotation_fields, prefixes=["INFO/"] 6099 ) 6100 6101 # Query dict for fields to remove (update option) 6102 query_dict_remove = {} 6103 6104 # Fetch Anotation fields 6105 for annotation_field in annotation_fields: 6106 6107 # annotation_field_column 6108 annotation_field_column = map_columns.get( 6109 annotation_field, "INFO" 6110 ) 6111 6112 # field new name, if parametered 6113 annotation_fields_new_name = annotation_fields.get( 6114 annotation_field, annotation_field 6115 ) 6116 if not annotation_fields_new_name: 6117 annotation_fields_new_name = annotation_field 6118 6119 # To annotate 6120 # force_update_annotation = True 6121 # force_append_annotation = True 6122 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6123 if annotation_field in parquet_hdr_vcf_header_infos and ( 6124 force_update_annotation 6125 or force_append_annotation 6126 or ( 6127 annotation_fields_new_name 6128 not in self.get_header().infos 6129 ) 6130 ): 6131 
6132 # Add field to annotation to process list 6133 annotation_fields_processed.append( 6134 annotation_fields_new_name 6135 ) 6136 6137 # explode infos for the field 6138 annotation_fields_new_name_info_msg = "" 6139 if ( 6140 force_update_annotation 6141 and annotation_fields_new_name 6142 in self.get_header().infos 6143 ): 6144 # Remove field from INFO 6145 query = f""" 6146 UPDATE {table_variants} as table_variants 6147 SET INFO = REGEXP_REPLACE( 6148 concat(table_variants.INFO,''), 6149 ';*{annotation_fields_new_name}=[^;]*', 6150 '' 6151 ) 6152 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6153 """ 6154 annotation_fields_new_name_info_msg = " [update]" 6155 query_dict_remove[ 6156 f"remove 'INFO/{annotation_fields_new_name}'" 6157 ] = query 6158 6159 # Sep between fields in INFO 6160 nb_annotation_field += 1 6161 if nb_annotation_field > 1: 6162 annotation_field_sep = ";" 6163 else: 6164 annotation_field_sep = "" 6165 6166 log.info( 6167 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6168 ) 6169 6170 # Add INFO field to header 6171 parquet_hdr_vcf_header_infos_number = ( 6172 parquet_hdr_vcf_header_infos[annotation_field].num 6173 or "." 6174 ) 6175 parquet_hdr_vcf_header_infos_type = ( 6176 parquet_hdr_vcf_header_infos[annotation_field].type 6177 or "String" 6178 ) 6179 parquet_hdr_vcf_header_infos_description = ( 6180 parquet_hdr_vcf_header_infos[annotation_field].desc 6181 or f"{annotation_field} description" 6182 ) 6183 parquet_hdr_vcf_header_infos_source = ( 6184 parquet_hdr_vcf_header_infos[annotation_field].source 6185 or "unknown" 6186 ) 6187 parquet_hdr_vcf_header_infos_version = ( 6188 parquet_hdr_vcf_header_infos[annotation_field].version 6189 or "unknown" 6190 ) 6191 6192 vcf_reader.infos[annotation_fields_new_name] = ( 6193 vcf.parser._Info( 6194 annotation_fields_new_name, 6195 parquet_hdr_vcf_header_infos_number, 6196 parquet_hdr_vcf_header_infos_type, 6197 parquet_hdr_vcf_header_infos_description, 6198 parquet_hdr_vcf_header_infos_source, 6199 parquet_hdr_vcf_header_infos_version, 6200 self.code_type_map[ 6201 parquet_hdr_vcf_header_infos_type 6202 ], 6203 ) 6204 ) 6205 6206 # Append 6207 if force_append_annotation: 6208 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 6209 else: 6210 query_case_when_append = "" 6211 6212 # Annotation/Update query fields 6213 # Found in INFO column 6214 if ( 6215 annotation_field_column == "INFO" 6216 and "INFO" in parquet_hdr_vcf_header_columns 6217 ): 6218 sql_query_annotation_update_info_sets.append( 6219 f""" 6220 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6221 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6222 ELSE '' 6223 END 6224 """ 6225 ) 6226 # Found in a specific column 6227 else: 6228 sql_query_annotation_update_info_sets.append( 6229 f""" 6230 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6231 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6232 ELSE '' 6233 END 6234 """ 6235 ) 6236 sql_query_annotation_to_agregate.append( 6237 f""" 
string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6238 ) 6239 6240 # Not to annotate 6241 else: 6242 6243 if force_update_annotation: 6244 annotation_message = "forced" 6245 else: 6246 annotation_message = "skipped" 6247 6248 if annotation_field not in parquet_hdr_vcf_header_infos: 6249 log.warning( 6250 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6251 ) 6252 if annotation_fields_new_name in self.get_header().infos: 6253 log.warning( 6254 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6255 ) 6256 6257 # Check if ALL fields have to be annotated. Thus concat all INFO field 6258 # allow_annotation_full_info = True 6259 allow_annotation_full_info = not force_append_annotation 6260 6261 if parquet_type in ["regions"]: 6262 allow_annotation_full_info = False 6263 6264 if ( 6265 allow_annotation_full_info 6266 and nb_annotation_field == len(annotation_fields) 6267 and annotation_fields_all 6268 and ( 6269 "INFO" in parquet_hdr_vcf_header_columns 6270 and "INFO" in database.get_extra_columns() 6271 ) 6272 ): 6273 log.debug("Column INFO annotation enabled") 6274 sql_query_annotation_update_info_sets = [] 6275 sql_query_annotation_update_info_sets.append( 6276 f" table_parquet.INFO " 6277 ) 6278 6279 if sql_query_annotation_update_info_sets: 6280 6281 # Annotate 6282 log.info(f"Annotation '{annotation_name}' - Annotation...") 6283 6284 # Join query annotation update info sets for SQL 6285 sql_query_annotation_update_info_sets_sql = ",".join( 6286 sql_query_annotation_update_info_sets 6287 ) 6288 6289 # Check chromosomes list (and variants infos) 6290 sql_query_chromosomes = f""" 6291 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6292 FROM {table_variants} as table_variants 6293 GROUP BY table_variants."#CHROM" 6294 ORDER BY table_variants."#CHROM" 6295 """ 6296 sql_query_chromosomes_df = self.conn.execute( 6297 sql_query_chromosomes 6298 ).df() 6299 sql_query_chromosomes_dict = { 6300 entry["CHROM"]: { 6301 "count": entry["count_variants"], 6302 "min": entry["min_variants"], 6303 "max": entry["max_variants"], 6304 } 6305 for index, entry in sql_query_chromosomes_df.iterrows() 6306 } 6307 6308 # Init 6309 nb_of_query = 0 6310 nb_of_variant_annotated = 0 6311 query_dict = query_dict_remove 6312 6313 # for chrom in sql_query_chromosomes_df["CHROM"]: 6314 for chrom in sql_query_chromosomes_dict: 6315 6316 # Number of variant by chromosome 6317 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6318 chrom, {} 6319 ).get("count", 0) 6320 6321 log.debug( 6322 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
6323 ) 6324 6325 # Annotation with regions database 6326 if parquet_type in ["regions"]: 6327 sql_query_annotation_from_clause = f""" 6328 FROM ( 6329 SELECT 6330 '{chrom}' AS \"#CHROM\", 6331 table_variants_from.\"POS\" AS \"POS\", 6332 {",".join(sql_query_annotation_to_agregate)} 6333 FROM {table_variants} as table_variants_from 6334 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6335 table_parquet_from."#CHROM" = '{chrom}' 6336 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6337 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6338 ) 6339 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6340 GROUP BY table_variants_from.\"POS\" 6341 ) 6342 as table_parquet 6343 """ 6344 6345 sql_query_annotation_where_clause = """ 6346 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6347 AND table_parquet.\"POS\" = table_variants.\"POS\" 6348 """ 6349 6350 # Annotation with variants database 6351 else: 6352 sql_query_annotation_from_clause = f""" 6353 FROM {parquet_file_link} as table_parquet 6354 """ 6355 sql_query_annotation_where_clause = f""" 6356 table_variants."#CHROM" = '{chrom}' 6357 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6358 AND table_parquet.\"POS\" = table_variants.\"POS\" 6359 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6360 AND table_parquet.\"REF\" = table_variants.\"REF\" 6361 """ 6362 6363 # Create update query 6364 sql_query_annotation_chrom_interval_pos = f""" 6365 UPDATE {table_variants} as table_variants 6366 SET INFO = 6367 concat( 6368 CASE WHEN table_variants.INFO NOT IN ('','.') 6369 THEN table_variants.INFO 6370 ELSE '' 6371 END 6372 , 6373 CASE WHEN table_variants.INFO NOT IN ('','.') 6374 AND ( 6375 concat({sql_query_annotation_update_info_sets_sql}) 6376 ) 6377 NOT IN ('','.') 6378 THEN ';' 6379 ELSE '' 6380 END 6381 , 6382 {sql_query_annotation_update_info_sets_sql} 6383 ) 6384 {sql_query_annotation_from_clause} 6385 WHERE {sql_query_annotation_where_clause} 6386 ; 6387 """ 6388 6389 # Add update query to dict 6390 query_dict[ 6391 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6392 ] = sql_query_annotation_chrom_interval_pos 6393 6394 nb_of_query = len(query_dict) 6395 num_query = 0 6396 6397 # SET max_expression_depth TO x 6398 self.conn.execute("SET max_expression_depth TO 10000") 6399 6400 for query_name in query_dict: 6401 query = query_dict[query_name] 6402 num_query += 1 6403 log.info( 6404 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 
                        )
                        result = self.conn.execute(query)
                        nb_of_variant_annotated_by_query = result.df()["Count"][0]
                        nb_of_variant_annotated += nb_of_variant_annotated_by_query
                        log.info(
                            f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                        )

                    log.info(
                        f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                    )

                else:

                    log.info(
                        f"Annotation '{annotation_name}' - No Annotations available"
                    )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        This function annotates variants with splice prediction tools (SPiP, SpliceAI)

        :param threads: The number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with splice tools")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))
        splice_config = config.get("tools", {}).get("splice", {})
        if not splice_config:
            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
        if not splice_config:
            msg_err = "No Splice tool config"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"splice_config: {splice_config}")

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("splice", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Splice docker image
        splice_docker_image = splice_config.get("docker").get("image")

        # Pull splice image if it's not already there
        if not check_docker_image_exists(splice_docker_image):
            log.warning(
                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
            )
            try:
                command(f"docker pull {splice_config.get('docker').get('image')}")
            except subprocess.CalledProcessError:
                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Config - splice databases
        splice_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("splice", DEFAULT_SPLICE_FOLDER)
        )
        splice_databases = full_path(splice_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("splice", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return None

        # Export in VCF
        log.debug("Create initial file to annotate")

        # Create output folder / work folder
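        # Illustrative 'options' shape for this splice step (a sketch: keys are
        # inferred from the .get() calls in this method, values are hypothetical):
        #
        #     options = {
        #         "genome": "hg19",          # or taken from config "assembly"
        #         "output_folder": "/tmp/splice_results",
        #         "workdir": "/work",
        #         "threads": 4,
        #     }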
        if options.get("output_folder", ""):
            output_folder = options.get("output_folder", "")
            if not os.path.exists(output_folder):
                Path(output_folder).mkdir(parents=True, exist_ok=True)
        else:
            output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
            if not os.path.exists(output_folder):
                Path(output_folder).mkdir(parents=True, exist_ok=True)

        if options.get("workdir", ""):
            workdir = options.get("workdir", "")
        else:
            workdir = "/work"

        # Create tmp VCF file
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=output_folder,
            suffix=".vcf",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        header = self.get_header()

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        if config.get("memory", None):
            memory_limit = config.get("memory", "8G").upper()
        else:
            memory_limit = "8G"
        log.debug(f"memory_limit: {memory_limit}")

        # Check number of variants to annotate
        where_clause_regex_spliceai = r"SpliceAI_\w+"
        where_clause_regex_spip = r"SPiP_\w+"
        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
        df_list_of_variants_to_annotate = self.get_query_to_df(
            query=f""" SELECT * FROM variants {where_clause} """
        )
        if len(df_list_of_variants_to_annotate) == 0:
            log.warning(
                "No variants to annotate with splice. Variants probably already annotated with splice"
            )
            return None
        else:
            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")

        # Export VCF file
        self.export_variant_vcf(
            vcf_file=tmp_vcf_name,
            remove_info=True,
            add_samples=True,
            index=False,
            where_clause=where_clause,
        )
        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
        if any(value for value in splice_config.values() if value is None):
            log.warning("At least one splice config parameter is empty")
            # exit annotation_splice
            return None

        # Params in splice nf
        def check_values(dico: dict):
            """
            Ensure parameters for the NF splice pipeline
            """
            for key, val in dico.items():
                if key == "genome":
                    if any(
                        assemb in options.get("genome", {})
                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
                    ):
                        yield f"--{key} hg19"
                    elif any(
                        assemb in options.get("genome", {})
                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
                    ):
                        yield f"--{key} hg38"
                elif (
                    (isinstance(val, str) and val)
                    or isinstance(val, int)
                    or isinstance(val, bool)
                ):
                    yield f"--{key} {val}"

        # Genome
        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
        options["genome"] = genome
        # NF params
        nf_params = []
        # Add options
        if options:
            log.debug(options)
            nf_params = list(check_values(options))
            log.debug(f"Splice NF params: {' '.join(nf_params)}")
        else:
            log.debug("No NF params provided")
        # Add threads
        if "threads" not in options.keys():
            nf_params.append(f"--threads {threads}")
        # Genome path
        genome_path = find_genome(
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER),
            file=f"{genome}.fa",
        )
        # Add genome path
        if not genome_path:
            raise ValueError(
                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
            )
        else:
            log.debug(f"Genome: {genome_path}")
            nf_params.append(f"--genome_path {genome_path}")

        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
            """
            Set up updated databases for SPiP and SpliceAI
            """

            try:

                # SpliceAI assembly transcriptome
                spliceai_assembly = os.path.join(
                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
                    options.get("genome"),
                    "transcriptome",
                )
                spip_assembly = options.get("genome")

                spip = find(
                    f"transcriptome_{spip_assembly}.RData",
                    config.get("folders", {}).get("databases", {}).get("spip", {}),
                )
                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
                log.debug(f"SPiP annotations: {spip}")
                log.debug(f"SpliceAI annotations: {spliceai}")
                if spip and spliceai:
                    return [
                        f"--spip_transcriptome {spip}",
                        f"--spliceai_transcriptome {spliceai}",
                    ]
                else:
                    log.warning(
                        "Can't find splice databases in configuration, using annotation files from the image"
                    )
            except TypeError:
                log.warning(
                    "Can't find splice databases in configuration, using annotation files from the image"
                )
            return []

        # Add options, checking whether a transcriptome option has already been
        # provided (substring check: nf_params entries are full "--key value" strings)
        if not any("spip_transcriptome" in nf_param for nf_param in nf_params) and not any(
            "spliceai_transcriptome" in nf_param for nf_param in nf_params
        ):
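            # Typically adds parameters like (illustrative paths, resolved from the
            # configured splice databases folders):
            #   --spip_transcriptome .../spip/hg19/transcriptome_hg19.RData
            #   --spliceai_transcriptome .../spliceai/hg19/transcriptome/spliceai.refseq.txt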
            splice_reference = splice_annotations(options, config)
            if splice_reference:
                nf_params.extend(splice_reference)
        random_uuid = f"HOWARD-SPLICE-{get_random()}"
        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
        log.debug(cmd)
        splice_config["docker"]["command"] = cmd

        # Ensure proxy is set
        proxy = [
            f"-e {var}={os.getenv(var)}"
            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
            if os.getenv(var) is not None
        ]
        docker_cmd = get_bin_command(
            tool="splice",
            bin_type="docker",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
        )
        log.debug(docker_cmd)
        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
        log.debug(res.stdout)
        if res.stderr:
            log.error(res.stderr)
        res.check_returncode()

        # Update variants
        log.info("Annotation - Updating...")
        # Find the splice output VCF next to the tmp input VCF
        expected_output_name = f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        log.debug(f"TMP splice output: {expected_output_name}")
        output_vcf = []
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if files == expected_output_name:
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            log.debug(f"Splice annotated vcf: {output_vcf[0]}")
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Create new header with splice infos
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            self.update_from_vcf(output_vcf[0])

            # Remove file
            remove_if_exists(output_vcf)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        :param name: The `get_config_default` function returns a dictionary containing default
        configurations for different calculations and prioritizations.
The `name` parameter is used to 6765 specify which specific configuration to retrieve from the dictionary 6766 :type name: str 6767 :return: The function `get_config_default` returns a dictionary containing default configuration 6768 settings for different calculations and prioritizations. The specific configuration settings are 6769 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6770 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6771 returned. If there is no match, an empty dictionary is returned. 6772 """ 6773 6774 config_default = { 6775 "calculations": { 6776 "variant_chr_pos_alt_ref": { 6777 "type": "sql", 6778 "name": "variant_chr_pos_alt_ref", 6779 "description": "Create a variant ID with chromosome, position, alt and ref", 6780 "available": False, 6781 "output_column_name": "variant_chr_pos_alt_ref", 6782 "output_column_type": "String", 6783 "output_column_description": "variant ID with chromosome, position, alt and ref", 6784 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6785 "operation_info": True, 6786 }, 6787 "VARTYPE": { 6788 "type": "sql", 6789 "name": "VARTYPE", 6790 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6791 "available": True, 6792 "table": "variants", 6793 "output_column_name": "VARTYPE", 6794 "output_column_type": "String", 6795 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6796 "operation_query": """ 6797 CASE 6798 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6799 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6800 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6801 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6802 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6803 ELSE 'UNDEFINED' 6804 END 6805 """, 6806 "info_fields": ["SVTYPE"], 6807 "operation_info": True, 6808 }, 6809 "snpeff_hgvs": { 6810 "type": "python", 6811 "name": "snpeff_hgvs", 6812 "description": "HGVS nomenclatures from snpEff annotation", 6813 "available": True, 6814 "function_name": "calculation_extract_snpeff_hgvs", 6815 "function_params": ["snpeff_hgvs", "ANN"], 6816 }, 6817 "snpeff_ann_explode": { 6818 "type": "python", 6819 "name": "snpeff_ann_explode", 6820 "description": "Explode snpEff annotations with uniquify values", 6821 "available": True, 6822 "function_name": "calculation_snpeff_ann_explode", 6823 "function_params": [False, "fields", "snpeff_", "ANN"], 6824 }, 6825 "snpeff_ann_explode_uniquify": { 6826 "type": "python", 6827 "name": "snpeff_ann_explode_uniquify", 6828 "description": "Explode snpEff annotations", 6829 "available": True, 6830 "function_name": "calculation_snpeff_ann_explode", 6831 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6832 }, 6833 "snpeff_ann_explode_json": { 6834 "type": "python", 6835 "name": "snpeff_ann_explode_json", 6836 "description": "Explode snpEff annotations in JSON format", 6837 "available": True, 6838 "function_name": "calculation_snpeff_ann_explode", 6839 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6840 }, 6841 "NOMEN": { 6842 "type": "python", 6843 "name": "NOMEN", 6844 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field (see parameters help)", 6845 "available": True, 6846 "function_name": "calculation_extract_nomen", 6847 "function_params": [], 6848 }, 6849 "RENAME_INFO_FIELDS": { 6850 "type": "python", 6851 "name": "RENAME_INFO_FIELDS", 6852 "description": "Rename or remove INFO/tags", 6853 "available": True, 6854 "function_name": "calculation_rename_info_fields", 6855 "function_params": [], 6856 }, 6857 "FINDBYPIPELINE": { 6858 "type": "python", 6859 "name": "FINDBYPIPELINE", 6860 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6861 "available": True, 6862 "function_name": "calculation_find_by_pipeline", 6863 "function_params": ["findbypipeline"], 6864 }, 6865 "FINDBYSAMPLE": { 6866 "type": "python", 6867 "name": "FINDBYSAMPLE", 6868 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6869 "available": True, 6870 "function_name": "calculation_find_by_pipeline", 6871 "function_params": ["findbysample"], 6872 }, 6873 "GENOTYPECONCORDANCE": { 6874 "type": "python", 6875 "name": "GENOTYPECONCORDANCE", 6876 "description": "Concordance of genotype for multi caller VCF", 6877 "available": True, 6878 "function_name": "calculation_genotype_concordance", 6879 "function_params": [], 6880 }, 6881 "BARCODE": { 6882 "type": "python", 6883 "name": "BARCODE", 6884 "description": "BARCODE as VaRank tool", 6885 "available": True, 6886 "function_name": "calculation_barcode", 6887 "function_params": [], 6888 }, 6889 "BARCODEFAMILY": { 6890 "type": "python", 6891 "name": "BARCODEFAMILY", 6892 "description": "BARCODEFAMILY as VaRank tool", 6893 "available": True, 6894 "function_name": "calculation_barcode_family", 6895 "function_params": ["BCF"], 6896 }, 6897 "TRIO": { 6898 "type": "python", 6899 "name": "TRIO", 6900 "description": "Inheritance for a trio family", 6901 "available": True, 6902 "function_name": "calculation_trio", 6903 "function_params": [], 6904 }, 6905 "VAF": { 6906 "type": "python", 6907 "name": "VAF", 6908 "description": "Variant Allele Frequency (VAF) harmonization", 6909 "available": True, 6910 "function_name": "calculation_vaf_normalization", 6911 "function_params": [], 6912 }, 6913 "VAF_stats": { 6914 "type": "python", 6915 "name": "VAF_stats", 6916 "description": "Variant Allele Frequency (VAF) statistics", 6917 "available": True, 6918 "function_name": "calculation_genotype_stats", 6919 "function_params": ["VAF"], 6920 }, 6921 "DP_stats": { 6922 "type": "python", 6923 "name": "DP_stats", 6924 "description": "Depth (DP) statistics", 6925 "available": True, 6926 "function_name": "calculation_genotype_stats", 6927 "function_params": ["DP"], 6928 }, 6929 "variant_id": { 6930 "type": "python", 6931 "name": "variant_id", 6932 "description": "Variant ID generated from variant position and type", 6933 "available": True, 6934 "function_name": "calculation_variant_id", 6935 "function_params": [], 6936 }, 6937 "transcripts_json": { 6938 "type": "python", 6939 "name": "transcripts_json", 6940 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6941 "available": True, 6942 "function_name": "calculation_transcripts_annotation", 6943 "function_params": ["transcripts_json", None], 6944 }, 6945 "transcripts_ann": { 6946 "type": "python", 6947 "name": "transcripts_ann", 6948 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6949 "available": True, 6950 "function_name": "calculation_transcripts_annotation", 6951 "function_params": 
[None, "transcripts_ann"], 6952 }, 6953 "transcripts_annotations": { 6954 "type": "python", 6955 "name": "transcripts_annotations", 6956 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6957 "available": True, 6958 "function_name": "calculation_transcripts_annotation", 6959 "function_params": [None, None], 6960 }, 6961 "transcripts_prioritization": { 6962 "type": "python", 6963 "name": "transcripts_prioritization", 6964 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6965 "available": True, 6966 "function_name": "calculation_transcripts_prioritization", 6967 "function_params": [], 6968 }, 6969 "transcripts_export": { 6970 "type": "python", 6971 "name": "transcripts_export", 6972 "description": "Export transcripts table/view as a file (using param.json)", 6973 "available": True, 6974 "function_name": "calculation_transcripts_export", 6975 "function_params": [], 6976 }, 6977 }, 6978 "prioritizations": { 6979 "default": { 6980 "ANN2": [ 6981 { 6982 "type": "contains", 6983 "value": "HIGH", 6984 "score": 5, 6985 "flag": "PASS", 6986 "comment": [ 6987 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6988 ], 6989 }, 6990 { 6991 "type": "contains", 6992 "value": "MODERATE", 6993 "score": 3, 6994 "flag": "PASS", 6995 "comment": [ 6996 "A non-disruptive variant that might change protein effectiveness" 6997 ], 6998 }, 6999 { 7000 "type": "contains", 7001 "value": "LOW", 7002 "score": 0, 7003 "flag": "FILTERED", 7004 "comment": [ 7005 "Assumed to be mostly harmless or unlikely to change protein behavior" 7006 ], 7007 }, 7008 { 7009 "type": "contains", 7010 "value": "MODIFIER", 7011 "score": 0, 7012 "flag": "FILTERED", 7013 "comment": [ 7014 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7015 ], 7016 }, 7017 ], 7018 } 7019 }, 7020 } 7021 7022 return config_default.get(name, None) 7023 7024 def get_config_json( 7025 self, name: str, config_dict: dict = {}, config_file: str = None 7026 ) -> dict: 7027 """ 7028 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7029 default values, a dictionary, and a file. 7030 7031 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7032 the name of the configuration. It is used to identify and retrieve the configuration settings 7033 for a specific component or module 7034 :type name: str 7035 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7036 dictionary that allows you to provide additional configuration settings or overrides. When you 7037 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7038 the key is the configuration setting you want to override or 7039 :type config_dict: dict 7040 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7041 specify the path to a configuration file that contains additional settings. If provided, the 7042 function will read the contents of this file and update the configuration dictionary with the 7043 values found in the file, overriding any existing values with the 7044 :type config_file: str 7045 :return: The function `get_config_json` returns a dictionary containing the configuration 7046 settings. 
        """

        # Create with default prioritizations
        config_default = self.get_config_default(name=name)
        configuration = config_default or {}

        # Override configuration from dict
        for config in config_dict:
            configuration[config] = config_dict[config]

        # Override configuration from file
        config_file = full_path(config_file)
        if config_file:
            if os.path.exists(config_file):
                with open(config_file) as config_file_content:
                    config_file_dict = yaml.safe_load(config_file_content)
                    for config in config_file_dict:
                        configuration[config] = config_file_dict[config]
            else:
                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
                log.error(msg_error)
                raise ValueError(msg_error)

        return configuration
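    # Usage sketch (illustrative; the file path and override dict are hypothetical,
    # the criterion shape follows the default "prioritizations" configuration above):
    #
    #     prioritizations_config = self.get_config_json(
    #         name="prioritizations",
    #         config_dict={"my_profile": {"DP": [{"type": "gte", "value": "30", "score": 1, "flag": "PASS"}]}},
    #         config_file="~/howard/config/prioritizations.yml",
    #     )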
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        :param table: The `table` parameter is used to specify the name of the table
        (presumably a VCF file) on which the prioritization operation will be performed. If
        a table name is provided, the method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
        provided, the code will use a default prefix value of "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter is used to pass additional parameters specific to
        the prioritization process, such as prioritization profiles, fields, scoring modes, flags,
        comments, and other configurations needed for the prioritization of variants in a VCF
        :type pz_param: dict
        :return: A boolean value: True if prioritization ran, False if no profile is defined.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, use all profiles from the configuration
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization...
") 7160 else: 7161 log.debug(f"No profile defined") 7162 return False 7163 7164 if not default_profile and len(profiles): 7165 default_profile = profiles[0] 7166 7167 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7168 log.debug("Profiles to check: " + str(list(profiles))) 7169 7170 # Variables 7171 if table is not None: 7172 table_variants = table 7173 else: 7174 table_variants = self.get_table_variants(clause="update") 7175 log.debug(f"Table to prioritize: {table_variants}") 7176 7177 # Added columns 7178 added_columns = [] 7179 7180 # Create list of PZfields 7181 # List of PZFields 7182 list_of_pzfields_original = pzfields + [ 7183 pzfield + pzfields_sep + profile 7184 for pzfield in pzfields 7185 for profile in profiles 7186 ] 7187 list_of_pzfields = [] 7188 log.debug(f"{list_of_pzfields_original}") 7189 7190 # Remove existing PZfields to use if exists 7191 for pzfield in list_of_pzfields_original: 7192 if self.get_header().infos.get(pzfield, None) is None: 7193 list_of_pzfields.append(pzfield) 7194 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7195 else: 7196 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7197 7198 if list_of_pzfields: 7199 7200 # Explode Infos prefix 7201 explode_infos_prefix = self.get_explode_infos_prefix() 7202 7203 # PZfields tags description 7204 PZfields_INFOS = { 7205 f"{pz_prefix}Tags": { 7206 "ID": f"{pz_prefix}Tags", 7207 "Number": ".", 7208 "Type": "String", 7209 "Description": "Variant tags based on annotation criteria", 7210 }, 7211 f"{pz_prefix}Score": { 7212 "ID": f"{pz_prefix}Score", 7213 "Number": 1, 7214 "Type": "Integer", 7215 "Description": "Variant score based on annotation criteria", 7216 }, 7217 f"{pz_prefix}Flag": { 7218 "ID": f"{pz_prefix}Flag", 7219 "Number": 1, 7220 "Type": "String", 7221 "Description": "Variant flag based on annotation criteria", 7222 }, 7223 f"{pz_prefix}Comment": { 7224 "ID": f"{pz_prefix}Comment", 7225 "Number": ".", 7226 "Type": "String", 7227 "Description": "Variant comment based on annotation criteria", 7228 }, 7229 f"{pz_prefix}Infos": { 7230 "ID": f"{pz_prefix}Infos", 7231 "Number": ".", 7232 "Type": "String", 7233 "Description": "Variant infos based on annotation criteria", 7234 }, 7235 f"{pz_prefix}Class": { 7236 "ID": f"{pz_prefix}Class", 7237 "Number": ".", 7238 "Type": "String", 7239 "Description": "Variant class based on annotation criteria", 7240 }, 7241 } 7242 7243 # Create INFO fields if not exist 7244 for field in PZfields_INFOS: 7245 field_ID = PZfields_INFOS[field]["ID"] 7246 field_description = PZfields_INFOS[field]["Description"] 7247 if field_ID not in self.get_header().infos and field_ID in pzfields: 7248 field_description = ( 7249 PZfields_INFOS[field]["Description"] 7250 + f", profile {default_profile}" 7251 ) 7252 self.get_header().infos[field_ID] = vcf.parser._Info( 7253 field_ID, 7254 PZfields_INFOS[field]["Number"], 7255 PZfields_INFOS[field]["Type"], 7256 field_description, 7257 "unknown", 7258 "unknown", 7259 code_type_map[PZfields_INFOS[field]["Type"]], 7260 ) 7261 7262 # Create INFO fields if not exist for each profile 7263 for profile in prioritizations_config: 7264 if profile in profiles or profiles == []: 7265 for field in PZfields_INFOS: 7266 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7267 field_description = ( 7268 PZfields_INFOS[field]["Description"] 7269 + f", profile {profile}" 7270 ) 7271 if ( 7272 field_ID not in self.get_header().infos 7273 and field in pzfields 7274 ): 7275 
self.get_header().infos[field_ID] = vcf.parser._Info( 7276 field_ID, 7277 PZfields_INFOS[field]["Number"], 7278 PZfields_INFOS[field]["Type"], 7279 field_description, 7280 "unknown", 7281 "unknown", 7282 code_type_map[PZfields_INFOS[field]["Type"]], 7283 ) 7284 7285 # Header 7286 for pzfield in list_of_pzfields: 7287 if re.match(f"{pz_prefix}Score.*", pzfield): 7288 added_column = self.add_column( 7289 table_name=table_variants, 7290 column_name=pzfield, 7291 column_type="INTEGER", 7292 default_value="0", 7293 ) 7294 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7295 added_column = self.add_column( 7296 table_name=table_variants, 7297 column_name=pzfield, 7298 column_type="BOOLEAN", 7299 default_value="1", 7300 ) 7301 elif re.match(f"{pz_prefix}Class.*", pzfield): 7302 added_column = self.add_column( 7303 table_name=table_variants, 7304 column_name=pzfield, 7305 column_type="VARCHAR[]", 7306 default_value="null", 7307 ) 7308 else: 7309 added_column = self.add_column( 7310 table_name=table_variants, 7311 column_name=pzfield, 7312 column_type="STRING", 7313 default_value="''", 7314 ) 7315 added_columns.append(added_column) 7316 7317 # Profiles 7318 if profiles: 7319 7320 # foreach profile in configuration file 7321 for profile in prioritizations_config: 7322 7323 # If profile is asked in param, or ALL are asked (empty profile []) 7324 if profile in profiles or profiles == []: 7325 log.info(f"Profile '{profile}'") 7326 7327 sql_set_info_option = "" 7328 7329 sql_set_info = [] 7330 7331 # PZ fields set 7332 7333 # PZScore 7334 if ( 7335 f"{pz_prefix}Score{pzfields_sep}{profile}" 7336 in list_of_pzfields 7337 ): 7338 sql_set_info.append( 7339 f""" 7340 concat( 7341 '{pz_prefix}Score{pzfields_sep}{profile}=', 7342 {pz_prefix}Score{pzfields_sep}{profile} 7343 ) 7344 """ 7345 ) 7346 if ( 7347 profile == default_profile 7348 and f"{pz_prefix}Score" in list_of_pzfields 7349 ): 7350 sql_set_info.append( 7351 f""" 7352 concat( 7353 '{pz_prefix}Score=', 7354 {pz_prefix}Score{pzfields_sep}{profile} 7355 ) 7356 """ 7357 ) 7358 7359 # PZFlag 7360 if ( 7361 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7362 in list_of_pzfields 7363 ): 7364 sql_set_info.append( 7365 f""" 7366 concat( 7367 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7368 CASE 7369 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7370 THEN 'PASS' 7371 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7372 THEN 'FILTERED' 7373 END 7374 ) 7375 """ 7376 ) 7377 if ( 7378 profile == default_profile 7379 and f"{pz_prefix}Flag" in list_of_pzfields 7380 ): 7381 sql_set_info.append( 7382 f""" 7383 concat( 7384 '{pz_prefix}Flag=', 7385 CASE 7386 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7387 THEN 'PASS' 7388 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7389 THEN 'FILTERED' 7390 END 7391 ) 7392 """ 7393 ) 7394 7395 # PZClass 7396 if ( 7397 f"{pz_prefix}Class{pzfields_sep}{profile}" 7398 in list_of_pzfields 7399 ): 7400 sql_set_info.append( 7401 f""" 7402 concat( 7403 '{pz_prefix}Class{pzfields_sep}{profile}=', 7404 CASE 7405 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7406 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7407 ELSE '.' 
7408 END 7409 ) 7410 7411 """ 7412 ) 7413 if ( 7414 profile == default_profile 7415 and f"{pz_prefix}Class" in list_of_pzfields 7416 ): 7417 sql_set_info.append( 7418 f""" 7419 concat( 7420 '{pz_prefix}Class=', 7421 CASE 7422 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7423 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7424 ELSE '.' 7425 END 7426 ) 7427 """ 7428 ) 7429 7430 # PZComment 7431 if ( 7432 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7433 in list_of_pzfields 7434 ): 7435 sql_set_info.append( 7436 f""" 7437 CASE 7438 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7439 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7440 ELSE '' 7441 END 7442 """ 7443 ) 7444 if ( 7445 profile == default_profile 7446 and f"{pz_prefix}Comment" in list_of_pzfields 7447 ): 7448 sql_set_info.append( 7449 f""" 7450 CASE 7451 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7452 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7453 ELSE '' 7454 END 7455 """ 7456 ) 7457 7458 # PZInfos 7459 if ( 7460 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7461 in list_of_pzfields 7462 ): 7463 sql_set_info.append( 7464 f""" 7465 CASE 7466 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7467 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7468 ELSE '' 7469 END 7470 """ 7471 ) 7472 if ( 7473 profile == default_profile 7474 and f"{pz_prefix}Infos" in list_of_pzfields 7475 ): 7476 sql_set_info.append( 7477 f""" 7478 CASE 7479 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7480 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7481 ELSE '' 7482 END 7483 """ 7484 ) 7485 7486 # Merge PZfields 7487 sql_set_info_option = "" 7488 sql_set_sep = "" 7489 for sql_set in sql_set_info: 7490 if sql_set_sep: 7491 sql_set_info_option += f""" 7492 , concat('{sql_set_sep}', {sql_set}) 7493 """ 7494 else: 7495 sql_set_info_option += f""" 7496 , {sql_set} 7497 """ 7498 sql_set_sep = ";" 7499 7500 sql_queries = [] 7501 for annotation in prioritizations_config[profile]: 7502 7503 # skip special sections 7504 if annotation.startswith("_"): 7505 continue 7506 7507 # For each criterions 7508 for criterion in prioritizations_config[profile][ 7509 annotation 7510 ]: 7511 7512 # Criterion mode 7513 criterion_mode = None 7514 if np.any( 7515 np.isin(list(criterion.keys()), ["type", "value"]) 7516 ): 7517 criterion_mode = "operation" 7518 elif np.any( 7519 np.isin(list(criterion.keys()), ["sql", "fields"]) 7520 ): 7521 criterion_mode = "sql" 7522 log.debug(f"Criterion Mode: {criterion_mode}") 7523 7524 # Criterion parameters 7525 criterion_type = criterion.get("type", None) 7526 criterion_value = criterion.get("value", None) 7527 criterion_sql = criterion.get("sql", None) 7528 criterion_fields = criterion.get("fields", None) 7529 criterion_score = criterion.get("score", 0) 7530 criterion_flag = criterion.get("flag", "PASS") 7531 criterion_class = criterion.get("class", None) 7532 criterion_flag_bool = criterion_flag == "PASS" 7533 criterion_comment = ( 7534 ", ".join(criterion.get("comment", [])) 7535 .replace("'", "''") 7536 .replace(";", ",") 7537 .replace("\t", " ") 7538 ) 7539 criterion_infos = ( 7540 str(criterion) 7541 .replace("'", "''") 7542 .replace(";", ",") 7543 .replace("\t", " ") 7544 ) 7545 7546 # SQL 7547 if criterion_sql is not None and isinstance( 7548 criterion_sql, list 7549 ): 7550 
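                            # The configuration may give the SQL clause as a list of lines: join them into a single expression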
criterion_sql = " ".join(criterion_sql) 7551 7552 # Fields and explode 7553 if criterion_fields is None: 7554 criterion_fields = [annotation] 7555 if not isinstance(criterion_fields, list): 7556 criterion_fields = str(criterion_fields).split(",") 7557 7558 # Class 7559 if criterion_class is not None and not isinstance( 7560 criterion_class, list 7561 ): 7562 criterion_class = str(criterion_class).split(",") 7563 7564 for annotation_field in criterion_fields: 7565 7566 # Explode specific annotation 7567 log.debug( 7568 f"Explode annotation '{annotation_field}'" 7569 ) 7570 added_columns += self.explode_infos( 7571 prefix=explode_infos_prefix, 7572 fields=[annotation_field], 7573 table=table_variants, 7574 ) 7575 extra_infos = self.get_extra_infos( 7576 table=table_variants 7577 ) 7578 7579 # Check if annotation field is present 7580 if ( 7581 f"{explode_infos_prefix}{annotation_field}" 7582 not in extra_infos 7583 ): 7584 msq_err = f"Annotation '{annotation_field}' not in data" 7585 log.error(msq_err) 7586 raise ValueError(msq_err) 7587 else: 7588 log.debug( 7589 f"Annotation '{annotation_field}' in data" 7590 ) 7591 7592 sql_set = [] 7593 sql_set_info = [] 7594 7595 # PZ fields set 7596 7597 # PZScore 7598 if ( 7599 f"{pz_prefix}Score{pzfields_sep}{profile}" 7600 in list_of_pzfields 7601 ): 7602 # VaRank prioritization score mode 7603 if prioritization_score_mode.upper().strip() in [ 7604 "VARANK", 7605 "MAX", 7606 "MAXIMUM", 7607 "TOP", 7608 ]: 7609 sql_set.append( 7610 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7611 ) 7612 # default HOWARD prioritization score mode 7613 else: 7614 sql_set.append( 7615 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7616 ) 7617 7618 # PZFlag 7619 if ( 7620 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7621 in list_of_pzfields 7622 ): 7623 sql_set.append( 7624 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7625 ) 7626 7627 # PZClass 7628 if ( 7629 f"{pz_prefix}Class{pzfields_sep}{profile}" 7630 in list_of_pzfields 7631 and criterion_class is not None 7632 ): 7633 sql_set.append( 7634 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7635 ) 7636 7637 # PZComment 7638 if ( 7639 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7640 in list_of_pzfields 7641 ): 7642 sql_set.append( 7643 f""" 7644 {pz_prefix}Comment{pzfields_sep}{profile} = 7645 concat( 7646 {pz_prefix}Comment{pzfields_sep}{profile}, 7647 CASE 7648 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7649 THEN ', ' 7650 ELSE '' 7651 END, 7652 '{criterion_comment}' 7653 ) 7654 """ 7655 ) 7656 7657 # PZInfos 7658 if ( 7659 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7660 in list_of_pzfields 7661 ): 7662 sql_set.append( 7663 f""" 7664 {pz_prefix}Infos{pzfields_sep}{profile} = 7665 concat( 7666 {pz_prefix}Infos{pzfields_sep}{profile}, 7667 '{criterion_infos}' 7668 ) 7669 """ 7670 ) 7671 sql_set_option = ",".join(sql_set) 7672 7673 # Criterion and comparison 7674 if sql_set_option: 7675 7676 if criterion_mode in ["operation"]: 7677 7678 try: 7679 float(criterion_value) 7680 sql_update = f""" 7681 UPDATE {table_variants} 7682 SET {sql_set_option} 7683 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7684 AND 
CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                    """
                                except (ValueError, TypeError):
                                    # Value is not numeric: fall back to a string/regex comparison
                                    contains_option = ""
                                    if criterion_type == "contains":
                                        contains_option = ".*"
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                    """
                                sql_queries.append(sql_update)

                            elif criterion_mode in ["sql"]:

                                sql_update = f"""
                                    UPDATE {table_variants}
                                    SET {sql_set_option}
                                    WHERE {criterion_sql}
                                """
                                sql_queries.append(sql_update)

                            else:
                                msg_err = "Prioritization criterion mode failed (must be either 'operation' or 'sql')"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                        else:
                            log.warning(
                                f"NO SQL SET option for '{annotation}' - '{criterion}'"
                            )

                    # PZTags
                    if (
                        f"{pz_prefix}Tags{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):

                        # Create PZTags value
                        pztags_value = ""
                        pztags_sep_default = ","
                        pztags_sep = ""
                        for pzfield in pzfields:
                            if pzfield not in [f"{pz_prefix}Tags"]:
                                if (
                                    f"{pzfield}{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if pzfield in [f"{pz_prefix}Flag"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                            END, '"""
                                    elif pzfield in [f"{pz_prefix}Class"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                ELSE '.'
                                            END, '"""
                                    else:
                                        pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                    pztags_sep = pztags_sep_default

                        # Add query update for PZTags
                        sql_update_pztags = f"""
                            UPDATE {table_variants}
                            SET INFO = concat(
                                INFO,
                                CASE WHEN INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END,
                                '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                            )
                        """
                        sql_queries.append(sql_update_pztags)

                        # Add query update for PZTags for the default profile
                        if profile == default_profile:
                            sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    ';',
                                    '{pz_prefix}Tags={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags_default)

                    log.info(f"""Profile '{profile}' - Prioritization... """)

                    if sql_queries:

                        for sql_query in sql_queries:
                            log.debug(
                                f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                            )
                            self.conn.execute(sql_query)

                    log.info(f"""Profile '{profile}' - Update... 
""") 7785 sql_query_update = f""" 7786 UPDATE {table_variants} 7787 SET INFO = 7788 concat( 7789 CASE 7790 WHEN INFO NOT IN ('','.') 7791 THEN concat(INFO, ';') 7792 ELSE '' 7793 END 7794 {sql_set_info_option} 7795 ) 7796 """ 7797 self.conn.execute(sql_query_update) 7798 7799 else: 7800 7801 log.warning(f"No profiles in parameters") 7802 7803 # Remove added columns 7804 for added_column in added_columns: 7805 self.drop_column(column=added_column) 7806 7807 # Explode INFOS fields into table fields 7808 if self.get_explode_infos(): 7809 self.explode_infos( 7810 prefix=self.get_explode_infos_prefix(), 7811 fields=self.get_explode_infos_fields(), 7812 force=True, 7813 ) 7814 7815 return True 7816 7817 ### 7818 # HGVS 7819 ### 7820 7821 def annotation_hgvs(self, threads: int = None) -> None: 7822 """ 7823 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7824 coordinates and alleles. 7825 7826 :param threads: The `threads` parameter is an optional integer that specifies the number of 7827 threads to use for parallel processing. If no value is provided, it will default to the number 7828 of threads obtained from the `get_threads()` method 7829 :type threads: int 7830 """ 7831 7832 # Function for each partition of the Dask Dataframe 7833 def partition_function(partition): 7834 """ 7835 The function `partition_function` applies the `annotation_hgvs_partition` function to 7836 each row of a DataFrame called `partition`. 7837 7838 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7839 to be processed 7840 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7841 the "partition" dataframe along the axis 1. 7842 """ 7843 return partition.apply(annotation_hgvs_partition, axis=1) 7844 7845 def annotation_hgvs_partition(row) -> str: 7846 """ 7847 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7848 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7849 7850 :param row: A dictionary-like object that contains the values for the following keys: 7851 :return: a string that contains the HGVS names associated with the given row of data. 
7852 """ 7853 7854 chr = row["CHROM"] 7855 pos = row["POS"] 7856 ref = row["REF"] 7857 alt = row["ALT"] 7858 7859 # Find list of associated transcripts 7860 transcripts_list = list( 7861 polars_conn.execute( 7862 f""" 7863 SELECT transcript 7864 FROM refseq_df 7865 WHERE CHROM='{chr}' 7866 AND POS={pos} 7867 """ 7868 )["transcript"] 7869 ) 7870 7871 # Full HGVS annotation in list 7872 hgvs_full_list = [] 7873 7874 for transcript_name in transcripts_list: 7875 7876 # Transcript 7877 transcript = get_transcript( 7878 transcripts=transcripts, transcript_name=transcript_name 7879 ) 7880 # Exon 7881 if use_exon: 7882 exon = transcript.find_exon_number(pos) 7883 else: 7884 exon = None 7885 # Protein 7886 transcript_protein = None 7887 if use_protein or add_protein or full_format: 7888 transcripts_protein = list( 7889 polars_conn.execute( 7890 f""" 7891 SELECT protein 7892 FROM refseqlink_df 7893 WHERE transcript='{transcript_name}' 7894 LIMIT 1 7895 """ 7896 )["protein"] 7897 ) 7898 if len(transcripts_protein): 7899 transcript_protein = transcripts_protein[0] 7900 7901 # HGVS name 7902 hgvs_name = format_hgvs_name( 7903 chr, 7904 pos, 7905 ref, 7906 alt, 7907 genome=genome, 7908 transcript=transcript, 7909 transcript_protein=transcript_protein, 7910 exon=exon, 7911 use_gene=use_gene, 7912 use_protein=use_protein, 7913 full_format=full_format, 7914 use_version=use_version, 7915 codon_type=codon_type, 7916 ) 7917 hgvs_full_list.append(hgvs_name) 7918 if add_protein and not use_protein and not full_format: 7919 hgvs_name = format_hgvs_name( 7920 chr, 7921 pos, 7922 ref, 7923 alt, 7924 genome=genome, 7925 transcript=transcript, 7926 transcript_protein=transcript_protein, 7927 exon=exon, 7928 use_gene=use_gene, 7929 use_protein=True, 7930 full_format=False, 7931 use_version=use_version, 7932 codon_type=codon_type, 7933 ) 7934 hgvs_full_list.append(hgvs_name) 7935 7936 # Create liste of HGVS annotations 7937 hgvs_full = ",".join(hgvs_full_list) 7938 7939 return hgvs_full 7940 7941 # Polars connexion 7942 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7943 7944 # Config 7945 config = self.get_config() 7946 7947 # Databases 7948 # Genome 7949 databases_genomes_folders = ( 7950 config.get("folders", {}) 7951 .get("databases", {}) 7952 .get("genomes", DEFAULT_GENOME_FOLDER) 7953 ) 7954 databases_genome = ( 7955 config.get("folders", {}).get("databases", {}).get("genomes", "") 7956 ) 7957 # refseq database folder 7958 databases_refseq_folders = ( 7959 config.get("folders", {}) 7960 .get("databases", {}) 7961 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7962 ) 7963 # refseq 7964 databases_refseq = config.get("databases", {}).get("refSeq", None) 7965 # refSeqLink 7966 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7967 7968 # Param 7969 param = self.get_param() 7970 7971 # Quick HGVS 7972 if "hgvs_options" in param and param.get("hgvs_options", ""): 7973 log.info(f"Quick HGVS Annotation:") 7974 if not param.get("hgvs", None): 7975 param["hgvs"] = {} 7976 for option in param.get("hgvs_options", "").split(","): 7977 option_var_val = option.split("=") 7978 option_var = option_var_val[0] 7979 if len(option_var_val) > 1: 7980 option_val = option_var_val[1] 7981 else: 7982 option_val = "True" 7983 if option_val.upper() in ["TRUE"]: 7984 option_val = True 7985 elif option_val.upper() in ["FALSE"]: 7986 option_val = False 7987 log.info(f" {option_var}={option_val}") 7988 param["hgvs"][option_var] = option_val 7989 7990 # Check if HGVS annotation enabled 7991 if "hgvs" in param: 
7992 log.info(f"HGVS Annotation... ") 7993 for hgvs_option in param.get("hgvs", {}): 7994 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7995 else: 7996 return 7997 7998 # HGVS Param 7999 param_hgvs = param.get("hgvs", {}) 8000 use_exon = param_hgvs.get("use_exon", False) 8001 use_gene = param_hgvs.get("use_gene", False) 8002 use_protein = param_hgvs.get("use_protein", False) 8003 add_protein = param_hgvs.get("add_protein", False) 8004 full_format = param_hgvs.get("full_format", False) 8005 use_version = param_hgvs.get("use_version", False) 8006 codon_type = param_hgvs.get("codon_type", "3") 8007 8008 # refSseq refSeqLink 8009 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8010 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8011 8012 # Assembly 8013 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8014 8015 # Genome 8016 genome_file = None 8017 if find_genome(databases_genome): 8018 genome_file = find_genome(databases_genome) 8019 else: 8020 genome_file = find_genome( 8021 genome_path=databases_genomes_folders, assembly=assembly 8022 ) 8023 log.debug("Genome: " + str(genome_file)) 8024 8025 # refSseq 8026 refseq_file = find_file_prefix( 8027 input_file=databases_refseq, 8028 prefix="ncbiRefSeq", 8029 folder=databases_refseq_folders, 8030 assembly=assembly, 8031 ) 8032 log.debug("refSeq: " + str(refseq_file)) 8033 8034 # refSeqLink 8035 refseqlink_file = find_file_prefix( 8036 input_file=databases_refseqlink, 8037 prefix="ncbiRefSeqLink", 8038 folder=databases_refseq_folders, 8039 assembly=assembly, 8040 ) 8041 log.debug("refSeqLink: " + str(refseqlink_file)) 8042 8043 # Threads 8044 if not threads: 8045 threads = self.get_threads() 8046 log.debug("Threads: " + str(threads)) 8047 8048 # Variables 8049 table_variants = self.get_table_variants(clause="update") 8050 8051 # Get variants SNV and InDel only 8052 query_variants = f""" 8053 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8054 FROM {table_variants} 8055 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8056 """ 8057 df_variants = self.get_query_to_df(query_variants) 8058 8059 # Added columns 8060 added_columns = [] 8061 8062 # Add hgvs column in variants table 8063 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8064 added_column = self.add_column( 8065 table_variants, hgvs_column_name, "STRING", default_value=None 8066 ) 8067 added_columns.append(added_column) 8068 8069 log.debug(f"refSeq loading...") 8070 # refSeq in duckDB 8071 refseq_table = get_refseq_table( 8072 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8073 ) 8074 # Loading all refSeq in Dataframe 8075 refseq_query = f""" 8076 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8077 FROM {refseq_table} 8078 JOIN df_variants ON ( 8079 {refseq_table}.chrom = df_variants.CHROM 8080 AND {refseq_table}.txStart<=df_variants.POS 8081 AND {refseq_table}.txEnd>=df_variants.POS 8082 ) 8083 """ 8084 refseq_df = self.conn.query(refseq_query).pl() 8085 8086 if refseqlink_file: 8087 log.debug(f"refSeqLink loading...") 8088 # refSeqLink in duckDB 8089 refseqlink_table = get_refseq_table( 8090 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8091 ) 8092 # Loading all refSeqLink in Dataframe 8093 protacc_column = "protAcc_with_ver" 8094 mrnaacc_column = "mrnaAcc_with_ver" 8095 refseqlink_query = f""" 8096 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8097 FROM {refseqlink_table} 8098 JOIN 
{refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8099 WHERE protAcc_without_ver IS NOT NULL 8100 """ 8101 # Polars Dataframe 8102 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8103 8104 # Read RefSeq transcripts into a python dict/model. 8105 log.debug(f"Transcripts loading...") 8106 with tempfile.TemporaryDirectory() as tmpdir: 8107 transcripts_query = f""" 8108 COPY ( 8109 SELECT {refseq_table}.* 8110 FROM {refseq_table} 8111 JOIN df_variants ON ( 8112 {refseq_table}.chrom=df_variants.CHROM 8113 AND {refseq_table}.txStart<=df_variants.POS 8114 AND {refseq_table}.txEnd>=df_variants.POS 8115 ) 8116 ) 8117 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8118 """ 8119 self.conn.query(transcripts_query) 8120 with open(f"{tmpdir}/transcript.tsv") as infile: 8121 transcripts = read_transcripts(infile) 8122 8123 # Polars connexion 8124 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8125 8126 log.debug("Genome loading...") 8127 # Read genome sequence using pyfaidx. 8128 genome = Fasta(genome_file) 8129 8130 log.debug("Start annotation HGVS...") 8131 8132 # Create 8133 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8134 ddf = dd.from_pandas(df_variants, npartitions=threads) 8135 8136 # Use dask.dataframe.apply() to apply function on each partition 8137 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8138 8139 # Convert Dask DataFrame to Pandas Dataframe 8140 df = ddf.compute() 8141 8142 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 8143 with tempfile.TemporaryDirectory() as tmpdir: 8144 df_parquet = os.path.join(tmpdir, "df.parquet") 8145 df.to_parquet(df_parquet) 8146 8147 # Update hgvs column 8148 update_variant_query = f""" 8149 UPDATE {table_variants} 8150 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8151 FROM read_parquet('{df_parquet}') as df 8152 WHERE variants."#CHROM" = df.CHROM 8153 AND variants.POS = df.POS 8154 AND variants.REF = df.REF 8155 AND variants.ALT = df.ALT 8156 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8157 """ 8158 self.execute_query(update_variant_query) 8159 8160 # Update INFO column 8161 sql_query_update = f""" 8162 UPDATE {table_variants} 8163 SET INFO = 8164 concat( 8165 CASE 8166 WHEN INFO NOT IN ('','.') 8167 THEN concat(INFO, ';') 8168 ELSE '' 8169 END, 8170 'hgvs=', 8171 {hgvs_column_name} 8172 ) 8173 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8174 """ 8175 self.execute_query(sql_query_update) 8176 8177 # Add header 8178 HGVS_INFOS = { 8179 "hgvs": { 8180 "ID": "hgvs", 8181 "Number": ".", 8182 "Type": "String", 8183 "Description": f"HGVS annotatation with HOWARD", 8184 } 8185 } 8186 8187 for field in HGVS_INFOS: 8188 field_ID = HGVS_INFOS[field]["ID"] 8189 field_description = HGVS_INFOS[field]["Description"] 8190 self.get_header().infos[field_ID] = vcf.parser._Info( 8191 field_ID, 8192 HGVS_INFOS[field]["Number"], 8193 HGVS_INFOS[field]["Type"], 8194 field_description, 8195 "unknown", 8196 "unknown", 8197 code_type_map[HGVS_INFOS[field]["Type"]], 8198 ) 8199 8200 # Remove added columns 8201 for added_column in added_columns: 8202 self.drop_column(column=added_column) 8203 8204 ### 8205 # Calculation 8206 ### 8207 8208 def get_operations_help( 8209 self, operations_config_dict: dict = {}, operations_config_file: str = None 8210 ) -> list: 8211 8212 # Init 8213 operations_help = [] 8214 8215 # operations 8216 operations = self.get_config_json( 8217 
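            # Calculation operations are loaded through the same JSON/YAML configuration mechanism as prioritization profiles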
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f"   {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # Insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a dict of operations and, for each operation, checks whether it is a python or
        sql operation, then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle": null
                }
            }
        """

        # Param
        param = self.get_param()

        # Check operations config file
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # Operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info("Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                add_value_into_dict(
                    dict_tree=operations_tmp,
                    sections=[
                        calculation_operation.upper(),
                    ],
                    value=operations.get(calculation_operation.upper(), {}),
                )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info("Calculations...")

        # For each operation
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
operation=operation, operation_name=operation_name 8337 ) 8338 elif operation_type == "sql": 8339 self.calculation_process_sql( 8340 operation=operation, operation_name=operation_name 8341 ) 8342 else: 8343 log.error( 8344 f"Operations config: Type '{operation_type}' NOT available" 8345 ) 8346 raise ValueError( 8347 f"Operations config: Type '{operation_type}' NOT available" 8348 ) 8349 else: 8350 log.error( 8351 f"Operations config: Calculation '{operation_name}' NOT available" 8352 ) 8353 raise ValueError( 8354 f"Operations config: Calculation '{operation_name}' NOT available" 8355 ) 8356 8357 # Explode INFOS fields into table fields 8358 if self.get_explode_infos(): 8359 self.explode_infos( 8360 prefix=self.get_explode_infos_prefix(), 8361 fields=self.get_explode_infos_fields(), 8362 force=True, 8363 ) 8364 8365 def calculation_process_sql( 8366 self, operation: dict, operation_name: str = "unknown" 8367 ) -> None: 8368 """ 8369 The `calculation_process_sql` function takes in a mathematical operation as a string and 8370 performs the operation, updating the specified table with the result. 8371 8372 :param operation: The `operation` parameter is a dictionary that contains information about the 8373 mathematical operation to be performed. It includes the following keys: 8374 :type operation: dict 8375 :param operation_name: The `operation_name` parameter is a string that represents the name of 8376 the mathematical operation being performed. It is used for logging and error handling purposes, 8377 defaults to unknown 8378 :type operation_name: str (optional) 8379 """ 8380 8381 # Operation infos 8382 operation_name = operation.get("name", "unknown") 8383 log.debug(f"process SQL {operation_name}") 8384 output_column_name = operation.get("output_column_name", operation_name) 8385 output_column_type = operation.get("output_column_type", "String") 8386 prefix = operation.get("explode_infos_prefix", "") 8387 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8388 output_column_description = operation.get( 8389 "output_column_description", f"{operation_name} operation" 8390 ) 8391 operation_query = operation.get("operation_query", None) 8392 if isinstance(operation_query, list): 8393 operation_query = " ".join(operation_query) 8394 operation_info_fields = operation.get("info_fields", []) 8395 operation_info_fields_check = operation.get("info_fields_check", False) 8396 operation_info = operation.get("operation_info", True) 8397 operation_table = operation.get( 8398 "table", self.get_table_variants(clause="alter") 8399 ) 8400 8401 # table variants 8402 if operation_table: 8403 table_variants = operation_table 8404 else: 8405 table_variants = self.get_table_variants(clause="alter") 8406 8407 if operation_query: 8408 8409 # Info fields check 8410 operation_info_fields_check_result = True 8411 if operation_info_fields_check: 8412 header_infos = self.get_header().infos 8413 for info_field in operation_info_fields: 8414 operation_info_fields_check_result = ( 8415 operation_info_fields_check_result 8416 and info_field in header_infos 8417 ) 8418 8419 # If info fields available 8420 if operation_info_fields_check_result: 8421 8422 # Added_columns 8423 added_columns = [] 8424 8425 # Create VCF header field 8426 vcf_reader = self.get_header() 8427 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8428 output_column_name, 8429 ".", 8430 output_column_type, 8431 output_column_description, 8432 "howard calculation", 8433 "0", 8434 
self.code_type_map.get(output_column_type), 8435 ) 8436 8437 # Explode infos if needed 8438 log.debug(f"calculation_process_sql prefix {prefix}") 8439 added_columns += self.explode_infos( 8440 prefix=prefix, 8441 fields=[output_column_name] + operation_info_fields, 8442 force=False, 8443 table=table_variants, 8444 ) 8445 8446 # Create column 8447 added_column = self.add_column( 8448 table_name=table_variants, 8449 column_name=prefix + output_column_name, 8450 column_type=output_column_type_sql, 8451 default_value="null", 8452 ) 8453 added_columns.append(added_column) 8454 8455 # Operation calculation 8456 try: 8457 8458 # Query to update calculation column 8459 sql_update = f""" 8460 UPDATE {table_variants} 8461 SET "{prefix}{output_column_name}" = ({operation_query}) 8462 """ 8463 self.conn.execute(sql_update) 8464 8465 # Add to INFO 8466 if operation_info: 8467 sql_update_info = f""" 8468 UPDATE {table_variants} 8469 SET "INFO" = 8470 concat( 8471 CASE 8472 WHEN "INFO" IS NOT NULL 8473 THEN concat("INFO", ';') 8474 ELSE '' 8475 END, 8476 '{output_column_name}=', 8477 "{prefix}{output_column_name}" 8478 ) 8479 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8480 """ 8481 self.conn.execute(sql_update_info) 8482 8483 except: 8484 log.error( 8485 f"Operations config: Calculation '{operation_name}' query failed" 8486 ) 8487 raise ValueError( 8488 f"Operations config: Calculation '{operation_name}' query failed" 8489 ) 8490 8491 # Remove added columns 8492 for added_column in added_columns: 8493 log.debug(f"added_column: {added_column}") 8494 self.drop_column(column=added_column) 8495 8496 else: 8497 log.error( 8498 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8499 ) 8500 raise ValueError( 8501 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8502 ) 8503 8504 else: 8505 log.error( 8506 f"Operations config: Calculation '{operation_name}' query NOT defined" 8507 ) 8508 raise ValueError( 8509 f"Operations config: Calculation '{operation_name}' query NOT defined" 8510 ) 8511 8512 def calculation_process_function( 8513 self, operation: dict, operation_name: str = "unknown" 8514 ) -> None: 8515 """ 8516 The `calculation_process_function` takes in an operation dictionary and performs the specified 8517 function with the given parameters. 8518 8519 :param operation: The `operation` parameter is a dictionary that contains information about the 8520 operation to be performed. It has the following keys: 8521 :type operation: dict 8522 :param operation_name: The `operation_name` parameter is a string that represents the name of 8523 the operation being performed. It is used for logging purposes, defaults to unknown 8524 :type operation_name: str (optional) 8525 """ 8526 8527 operation_name = operation["name"] 8528 log.debug(f"process Python {operation_name}") 8529 function_name = operation["function_name"] 8530 function_params = operation["function_params"] 8531 getattr(self, function_name)(*function_params) 8532 8533 def calculation_variant_id(self) -> None: 8534 """ 8535 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8536 updates the INFO field of a variants table with the variant ID. 
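        For example (illustrative), an existing INFO value 'DP=100' becomes
        'DP=100;<variant_id_tag>=<id>', where <id> is taken from the variant ID column.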
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter is used to specify the name of the column
        that will store the HGVS nomenclatures extracted from the SnpEff annotation field,
        defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter represents the field in the VCF file
        that contains SnpEff annotations.
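        Typically this is snpEff's standard 'ANN' INFO field, whose value holds pipe-separated annotation sub-fields.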
This field is used to extract HGVS nomenclatures
        from the SnpEff annotations and add them as a new INFO field, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpeff_hgvs column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please annotate with snpEff before using this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file
        by exploding the annotation field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether
        duplicate entries should be removed from the exploded annotations, defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the format in which the
        output annotations will be generated: "fields" creates one INFO field per ANN sub-field,
        and "JSON" outputs the annotations as a single JSON value, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is used to specify the prefix that
        will be added to the output annotations generated during the calculation process. This
        prefix helps to differentiate the newly added annotations from existing ones in the
        output data, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter is used to specify the field in the VCF
        file that contains SnpEff annotations.
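        With output_format 'fields', each of its sub-fields (e.g. 'Allele', 'Annotation') is
        exploded into its own INFO field named with the output prefix, illustratively
        'snpeff_Allele' with the default prefix.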
This 8750 field will be processed to explode the HGVS annotations and update the variant information 8751 accordingly, defaults to ANN 8752 :type snpeff_field: str (optional) 8753 """ 8754 8755 # SnpEff annotation field 8756 snpeff_hgvs = "snpeff_ann_explode" 8757 8758 # Snpeff hgvs tags 8759 vcf_infos_tags = { 8760 snpeff_hgvs: "Explode snpEff annotations", 8761 } 8762 8763 # Prefix 8764 prefix = self.get_explode_infos_prefix() 8765 if prefix: 8766 prefix = "INFO/" 8767 8768 # snpEff fields 8769 speff_ann_infos = prefix + snpeff_field 8770 speff_hgvs_infos = prefix + snpeff_hgvs 8771 8772 # Variants table 8773 table_variants = self.get_table_variants() 8774 8775 # Header 8776 vcf_reader = self.get_header() 8777 8778 # Add columns 8779 added_columns = [] 8780 8781 # Explode HGVS field in column 8782 added_columns += self.explode_infos(fields=[snpeff_field]) 8783 log.debug(f"snpeff_field={snpeff_field}") 8784 log.debug(f"added_columns={added_columns}") 8785 8786 if snpeff_field in vcf_reader.infos: 8787 8788 # Extract ANN header 8789 ann_description = vcf_reader.infos[snpeff_field].desc 8790 pattern = r"'(.+?)'" 8791 match = re.search(pattern, ann_description) 8792 if match: 8793 ann_header_match = match.group(1).split(" | ") 8794 ann_header = [] 8795 ann_header_desc = {} 8796 for i in range(len(ann_header_match)): 8797 ann_header_info = "".join( 8798 char for char in ann_header_match[i] if char.isalnum() 8799 ) 8800 ann_header.append(ann_header_info) 8801 ann_header_desc[ann_header_info] = ann_header_match[i] 8802 if not ann_header_desc: 8803 raise ValueError("Invalid header description format") 8804 else: 8805 raise ValueError("Invalid header description format") 8806 8807 # Create variant id 8808 variant_id_column = self.get_variant_id_column() 8809 added_columns += [variant_id_column] 8810 8811 # Create dataframe 8812 dataframe_snpeff_hgvs = self.get_query_to_df( 8813 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8814 ) 8815 8816 # Create snpEff columns 8817 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8818 speff_ann_infos 8819 ].apply( 8820 lambda x: explode_snpeff_ann( 8821 str(x), 8822 uniquify=uniquify, 8823 output_format=output_format, 8824 prefix=output_prefix, 8825 header=list(ann_header_desc.values()), 8826 ) 8827 ) 8828 8829 # Header 8830 ann_annotations_prefix = "" 8831 if output_format.upper() in ["JSON"]: 8832 ann_annotations_prefix = f"{output_prefix}=" 8833 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8834 output_prefix, 8835 ".", 8836 "String", 8837 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8838 + " - JSON format", 8839 "howard calculation", 8840 "0", 8841 self.code_type_map.get("String"), 8842 ) 8843 else: 8844 for ann_annotation in ann_header: 8845 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8846 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8847 ann_annotation_id, 8848 ".", 8849 "String", 8850 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8851 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8852 "howard calculation", 8853 "0", 8854 self.code_type_map.get("String"), 8855 ) 8856 8857 # Update 8858 sql_update = f""" 8859 UPDATE variants 8860 SET "INFO" = 8861 concat( 8862 CASE 8863 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8864 THEN '' 8865 ELSE concat("INFO", ';') 8866 END, 8867 CASE 8868 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8869 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8870 THEN concat( 8871 
'{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please annotate with snpEff before using this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the NOMEN fields (HGVS nomenclature breakdown) from the HGVS
        annotation field and adds them to the INFO column.
        """

        # NOMEN field
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        threads = self.get_threads()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
.get("NOMEN", {}) 8987 .get("options", {}) 8988 .get("transcripts_column", None) 8989 ) 8990 8991 if transcripts_table and transcripts_column: 8992 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8993 # Explode if not exists 8994 added_columns += self.explode_infos( 8995 fields=[transcripts_column], table=transcripts_table 8996 ) 8997 else: 8998 extra_field_transcript = f"NULL" 8999 9000 # Transcripts of preference source order 9001 transcripts_order = ( 9002 param.get("calculation", {}) 9003 .get("calculations", {}) 9004 .get("NOMEN", {}) 9005 .get("options", {}) 9006 .get("transcripts_order", ["column", "file"]) 9007 ) 9008 9009 # Transcripts from file 9010 transcripts = transcripts_sources.get("file", []) 9011 9012 # Explode HGVS field in column 9013 added_columns += self.explode_infos(fields=[hgvs_field]) 9014 9015 # extra infos 9016 extra_infos = self.get_extra_infos() 9017 extra_field = prefix + hgvs_field 9018 9019 if extra_field in extra_infos: 9020 9021 # Create dataframe 9022 dataframe_hgvs = self.get_query_to_df( 9023 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 9024 ) 9025 9026 # Transcripts rank 9027 transcripts_rank = { 9028 transcript: rank for rank, transcript in enumerate(transcripts, start=1) 9029 } 9030 transcripts_len = len(transcripts_rank) 9031 9032 # Create main NOMEN column 9033 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 9034 lambda x: find_nomen( 9035 hgvs=x.hgvs, 9036 transcript=x.transcript, 9037 transcripts=transcripts_rank, 9038 pattern=nomen_pattern, 9039 transcripts_source_order=transcripts_order, 9040 transcripts_len=transcripts_len, 9041 ), 9042 axis=1, 9043 ) 9044 9045 # Explode NOMEN Structure and create SQL set for update 9046 sql_nomen_fields = [] 9047 for nomen_field in nomen_dict: 9048 9049 # Create VCF header field 9050 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9051 nomen_field, 9052 ".", 9053 "String", 9054 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 9055 "howard calculation", 9056 "0", 9057 self.code_type_map.get("String"), 9058 ) 9059 9060 # Add field to SQL query update 9061 sql_nomen_fields.append( 9062 f""" 9063 CASE 9064 WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('') 9065 THEN concat( 9066 ';{nomen_field}=', 9067 dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" 9068 ) 9069 ELSE '' 9070 END 9071 """ 9072 ) 9073 9074 # SQL set for update 9075 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9076 9077 # Update 9078 sql_update = f""" 9079 UPDATE variants 9080 SET "INFO" = 9081 concat( 9082 CASE 9083 WHEN "INFO" IS NULL 9084 THEN '' 9085 ELSE "INFO" 9086 END, 9087 {sql_nomen_fields_set} 9088 ) 9089 FROM dataframe_hgvs 9090 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9091 AND variants."POS" = dataframe_hgvs."POS" 9092 AND variants."REF" = dataframe_hgvs."REF" 9093 AND variants."ALT" = dataframe_hgvs."ALT" 9094 """ 9095 self.conn.execute(sql_update) 9096 9097 # Delete dataframe 9098 del dataframe_hgvs 9099 gc.collect() 9100 9101 # Remove added columns 9102 for added_column in added_columns: 9103 self.drop_column(column=added_column) 9104 9105 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9106 """ 9107 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9108 pipeline/sample for a variant and updates the variant information in a VCF file. 
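
        For example (illustrative), a variant genotyped in two of three samples could be
        annotated as 'findbypipeline=2/3'; the exact value format is produced by the
        `findbypipeline` helper.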
9109 9110 :param tag: The `tag` parameter is a string that represents the annotation field for the 9111 "findbypipeline" information in the VCF file. It is used to create the annotation field in the 9112 VCF header and to update the corresponding field in the variants table, defaults to 9113 findbypipeline 9114 :type tag: str (optional) 9115 """ 9116 9117 # if FORMAT and samples 9118 if ( 9119 "FORMAT" in self.get_header_columns_as_list() 9120 and self.get_header_sample_list() 9121 ): 9122 9123 # findbypipeline annotation field 9124 findbypipeline_tag = tag 9125 9126 # VCF infos tags 9127 vcf_infos_tags = { 9128 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9129 } 9130 9131 # Prefix 9132 prefix = self.get_explode_infos_prefix() 9133 9134 # Field 9135 findbypipeline_infos = prefix + findbypipeline_tag 9136 9137 # Variants table 9138 table_variants = self.get_table_variants() 9139 9140 # Header 9141 vcf_reader = self.get_header() 9142 9143 # Create variant id 9144 variant_id_column = self.get_variant_id_column() 9145 added_columns = [variant_id_column] 9146 9147 # variant_id, FORMAT and samples 9148 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9149 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9150 ) 9151 9152 # Create dataframe 9153 dataframe_findbypipeline = self.get_query_to_df( 9154 f""" SELECT {samples_fields} FROM {table_variants} """ 9155 ) 9156 9157 # Create findbypipeline column 9158 dataframe_findbypipeline[findbypipeline_infos] = ( 9159 dataframe_findbypipeline.apply( 9160 lambda row: findbypipeline( 9161 row, samples=self.get_header_sample_list() 9162 ), 9163 axis=1, 9164 ) 9165 ) 9166 9167 # Add snpeff_hgvs to header 9168 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9169 findbypipeline_tag, 9170 ".", 9171 "String", 9172 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9173 "howard calculation", 9174 "0", 9175 self.code_type_map.get("String"), 9176 ) 9177 9178 # Update 9179 sql_update = f""" 9180 UPDATE variants 9181 SET "INFO" = 9182 concat( 9183 CASE 9184 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9185 THEN '' 9186 ELSE concat("INFO", ';') 9187 END, 9188 CASE 9189 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9190 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9191 THEN concat( 9192 '{findbypipeline_tag}=', 9193 dataframe_findbypipeline."{findbypipeline_infos}" 9194 ) 9195 ELSE '' 9196 END 9197 ) 9198 FROM dataframe_findbypipeline 9199 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9200 """ 9201 self.conn.execute(sql_update) 9202 9203 # Remove added columns 9204 for added_column in added_columns: 9205 self.drop_column(column=added_column) 9206 9207 # Delete dataframe 9208 del dataframe_findbypipeline 9209 gc.collect() 9210 9211 def calculation_genotype_concordance(self) -> None: 9212 """ 9213 The function `calculation_genotype_concordance` calculates the genotype concordance for 9214 multi-caller VCF files and updates the variant information in the database. 
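        Example (a minimal usage sketch, assuming a loaded multi-caller VCF;
        the input path is illustrative and the INFO value format is whatever
        the `genotypeconcordance()` helper returns):

            variants = Variants(input="multicaller.vcf", load=True)
            variants.calculation_genotype_concordance()
            # each variant's INFO now carries a 'genotypeconcordance=...' entry
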
9215 """ 9216 9217 # if FORMAT and samples 9218 if ( 9219 "FORMAT" in self.get_header_columns_as_list() 9220 and self.get_header_sample_list() 9221 ): 9222 9223 # genotypeconcordance annotation field 9224 genotypeconcordance_tag = "genotypeconcordance" 9225 9226 # VCF infos tags 9227 vcf_infos_tags = { 9228 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9229 } 9230 9231 # Prefix 9232 prefix = self.get_explode_infos_prefix() 9233 9234 # Field 9235 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9236 9237 # Variants table 9238 table_variants = self.get_table_variants() 9239 9240 # Header 9241 vcf_reader = self.get_header() 9242 9243 # Create variant id 9244 variant_id_column = self.get_variant_id_column() 9245 added_columns = [variant_id_column] 9246 9247 # variant_id, FORMAT and samples 9248 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9249 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9250 ) 9251 9252 # Create dataframe 9253 dataframe_genotypeconcordance = self.get_query_to_df( 9254 f""" SELECT {samples_fields} FROM {table_variants} """ 9255 ) 9256 9257 # Create genotypeconcordance column 9258 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9259 dataframe_genotypeconcordance.apply( 9260 lambda row: genotypeconcordance( 9261 row, samples=self.get_header_sample_list() 9262 ), 9263 axis=1, 9264 ) 9265 ) 9266 9267 # Add genotypeconcordance to header 9268 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9269 genotypeconcordance_tag, 9270 ".", 9271 "String", 9272 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9273 "howard calculation", 9274 "0", 9275 self.code_type_map.get("String"), 9276 ) 9277 9278 # Update 9279 sql_update = f""" 9280 UPDATE variants 9281 SET "INFO" = 9282 concat( 9283 CASE 9284 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9285 THEN '' 9286 ELSE concat("INFO", ';') 9287 END, 9288 CASE 9289 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9290 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9291 THEN concat( 9292 '{genotypeconcordance_tag}=', 9293 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9294 ) 9295 ELSE '' 9296 END 9297 ) 9298 FROM dataframe_genotypeconcordance 9299 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9300 """ 9301 self.conn.execute(sql_update) 9302 9303 # Remove added columns 9304 for added_column in added_columns: 9305 self.drop_column(column=added_column) 9306 9307 # Delete dataframe 9308 del dataframe_genotypeconcordance 9309 gc.collect() 9310 9311 def calculation_barcode(self, tag: str = "barcode") -> None: 9312 """ 9313 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9314 updates the INFO field in the file with the calculated barcode values. 9315 9316 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9317 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9318 the default tag name is set to "barcode", defaults to barcode 9319 :type tag: str (optional) 9320 """ 9321 9322 # if FORMAT and samples 9323 if ( 9324 "FORMAT" in self.get_header_columns_as_list() 9325 and self.get_header_sample_list() 9326 ): 9327 9328 # barcode annotation field 9329 if not tag: 9330 tag = "barcode" 9331 9332 # VCF infos tags 9333 vcf_infos_tags = { 9334 tag: "barcode calculation (VaRank)", 9335 } 9336 9337 # Prefix 9338 prefix = self.get_explode_infos_prefix() 9339 9340 # Field 9341 barcode_infos = prefix + tag 9342 9343 # Variants table 9344 table_variants = self.get_table_variants() 9345 9346 # Header 9347 vcf_reader = self.get_header() 9348 9349 # Create variant id 9350 variant_id_column = self.get_variant_id_column() 9351 added_columns = [variant_id_column] 9352 9353 # variant_id, FORMAT and samples 9354 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9355 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9356 ) 9357 9358 # Create dataframe 9359 dataframe_barcode = self.get_query_to_df( 9360 f""" SELECT {samples_fields} FROM {table_variants} """ 9361 ) 9362 9363 # Create barcode column 9364 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9365 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9366 ) 9367 9368 # Add barcode to header 9369 vcf_reader.infos[tag] = vcf.parser._Info( 9370 tag, 9371 ".", 9372 "String", 9373 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9374 "howard calculation", 9375 "0", 9376 self.code_type_map.get("String"), 9377 ) 9378 9379 # Update 9380 sql_update = f""" 9381 UPDATE {table_variants} 9382 SET "INFO" = 9383 concat( 9384 CASE 9385 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9386 THEN '' 9387 ELSE concat("INFO", ';') 9388 END, 9389 CASE 9390 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9391 AND dataframe_barcode."{barcode_infos}" NOT NULL 9392 THEN concat( 9393 '{tag}=', 9394 dataframe_barcode."{barcode_infos}" 9395 ) 9396 ELSE '' 9397 END 9398 ) 9399 FROM dataframe_barcode 9400 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9401 """ 9402 self.conn.execute(sql_update) 9403 9404 # Remove added columns 9405 for added_column in added_columns: 9406 self.drop_column(column=added_column) 9407 9408 # Delete dataframe 9409 del dataframe_barcode 9410 gc.collect() 9411 9412 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9413 """ 9414 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9415 and updates the INFO field in the file with the calculated barcode values. 9416 9417 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9418 the barcode tag that will be added to the VCF file during the calculation process. 
If no value 9419 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9420 :type tag: str (optional) 9421 """ 9422 9423 # if FORMAT and samples 9424 if ( 9425 "FORMAT" in self.get_header_columns_as_list() 9426 and self.get_header_sample_list() 9427 ): 9428 9429 # barcode annotation field 9430 if not tag: 9431 tag = "BCF" 9432 9433 # VCF infos tags 9434 vcf_infos_tags = { 9435 tag: "barcode family calculation", 9436 f"{tag}S": "barcode family samples", 9437 } 9438 9439 # Param 9440 param = self.get_param() 9441 log.debug(f"param={param}") 9442 9443 # Prefix 9444 prefix = self.get_explode_infos_prefix() 9445 9446 # PED param 9447 ped = ( 9448 param.get("calculation", {}) 9449 .get("calculations", {}) 9450 .get("BARCODEFAMILY", {}) 9451 .get("family_pedigree", None) 9452 ) 9453 log.debug(f"ped={ped}") 9454 9455 # Load PED 9456 if ped: 9457 9458 # Pedigree is a file 9459 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9460 log.debug("Pedigree is file") 9461 with open(full_path(ped)) as ped: 9462 ped = yaml.safe_load(ped) 9463 9464 # Pedigree is a string 9465 elif isinstance(ped, str): 9466 log.debug("Pedigree is str") 9467 try: 9468 ped = json.loads(ped) 9469 log.debug("Pedigree is json str") 9470 except ValueError as e: 9471 ped_samples = ped.split(",") 9472 ped = {} 9473 for ped_sample in ped_samples: 9474 ped[ped_sample] = ped_sample 9475 9476 # Pedigree is a dict 9477 elif isinstance(ped, dict): 9478 log.debug("Pedigree is dict") 9479 9480 # Pedigree is not well formatted 9481 else: 9482 msg_error = "Pedigree not well formatted" 9483 log.error(msg_error) 9484 raise ValueError(msg_error) 9485 9486 # Construct list 9487 ped_samples = list(ped.values()) 9488 9489 else: 9490 log.debug("Pedigree not defined. Take all samples") 9491 ped_samples = self.get_header_sample_list() 9492 ped = {} 9493 for ped_sample in ped_samples: 9494 ped[ped_sample] = ped_sample 9495 9496 # Check pedigree 9497 if not ped or len(ped) == 0: 9498 msg_error = f"Error in pedigree: samples {ped_samples}" 9499 log.error(msg_error) 9500 raise ValueError(msg_error) 9501 9502 # Log 9503 log.info( 9504 "Calculation 'BARCODEFAMILY' - Samples: " 9505 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9506 ) 9507 log.debug(f"ped_samples={ped_samples}") 9508 9509 # Field 9510 barcode_infos = prefix + tag 9511 9512 # Variants table 9513 table_variants = self.get_table_variants() 9514 9515 # Header 9516 vcf_reader = self.get_header() 9517 9518 # Create variant id 9519 variant_id_column = self.get_variant_id_column() 9520 added_columns = [variant_id_column] 9521 9522 # variant_id, FORMAT and samples 9523 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9524 [f""" "{sample}" """ for sample in ped_samples] 9525 ) 9526 9527 # Create dataframe 9528 dataframe_barcode = self.get_query_to_df( 9529 f""" SELECT {samples_fields} FROM {table_variants} """ 9530 ) 9531 9532 # Create barcode column 9533 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9534 lambda row: barcode(row, samples=ped_samples), axis=1 9535 ) 9536 9537 # Add barcode family to header 9538 # Add vaf_normalization to header 9539 vcf_reader.formats[tag] = vcf.parser._Format( 9540 id=tag, 9541 num=".", 9542 type="String", 9543 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9544 type_code=self.code_type_map.get("String"), 9545 ) 9546 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9547 id=f"{tag}S", 9548 num=".", 9549 type="String", 9550 desc=vcf_infos_tags.get(f"{tag}S", "barcode 
family samples"), 9551 type_code=self.code_type_map.get("String"), 9552 ) 9553 9554 # Update 9555 # for sample in ped_samples: 9556 sql_update_set = [] 9557 for sample in self.get_header_sample_list() + ["FORMAT"]: 9558 if sample in ped_samples: 9559 value = f'dataframe_barcode."{barcode_infos}"' 9560 value_samples = ( 9561 "'" 9562 + ",".join([f""" "{sample}" """ for sample in ped_samples]) 9563 + "'" 9564 ) 9565 ped_samples 9566 elif sample == "FORMAT": 9567 value = f"'{tag}'" 9568 value_samples = f"'{tag}S'" 9569 else: 9570 value = "'.'" 9571 value_samples = "'.'" 9572 format_regex = r"[a-zA-Z0-9\s]" 9573 sql_update_set.append( 9574 f""" 9575 "{sample}" = 9576 concat( 9577 CASE 9578 WHEN {table_variants}."{sample}" = './.' 9579 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9580 ELSE {table_variants}."{sample}" 9581 END, 9582 ':', 9583 {value}, 9584 ':', 9585 {value_samples} 9586 ) 9587 """ 9588 ) 9589 9590 sql_update_set_join = ", ".join(sql_update_set) 9591 sql_update = f""" 9592 UPDATE {table_variants} 9593 SET {sql_update_set_join} 9594 FROM dataframe_barcode 9595 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9596 """ 9597 self.conn.execute(sql_update) 9598 9599 # Remove added columns 9600 for added_column in added_columns: 9601 self.drop_column(column=added_column) 9602 9603 # Delete dataframe 9604 del dataframe_barcode 9605 gc.collect() 9606 9607 def calculation_trio(self) -> None: 9608 """ 9609 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9610 information to the INFO field of each variant. 9611 """ 9612 9613 # if FORMAT and samples 9614 if ( 9615 "FORMAT" in self.get_header_columns_as_list() 9616 and self.get_header_sample_list() 9617 ): 9618 9619 # trio annotation field 9620 trio_tag = "trio" 9621 9622 # VCF infos tags 9623 vcf_infos_tags = { 9624 "trio": "trio calculation", 9625 } 9626 9627 # Param 9628 param = self.get_param() 9629 9630 # Prefix 9631 prefix = self.get_explode_infos_prefix() 9632 9633 # Trio param 9634 trio_ped = ( 9635 param.get("calculation", {}) 9636 .get("calculations", {}) 9637 .get("TRIO", {}) 9638 .get("trio_pedigree", None) 9639 ) 9640 9641 # Load trio 9642 if trio_ped: 9643 9644 # Trio pedigree is a file 9645 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9646 log.debug("TRIO pedigree is file") 9647 with open(full_path(trio_ped)) as trio_ped: 9648 trio_ped = yaml.safe_load(trio_ped) 9649 9650 # Trio pedigree is a string 9651 elif isinstance(trio_ped, str): 9652 log.debug("TRIO pedigree is str") 9653 try: 9654 trio_ped = json.loads(trio_ped) 9655 log.debug("TRIO pedigree is json str") 9656 except ValueError as e: 9657 trio_samples = trio_ped.split(",") 9658 if len(trio_samples) == 3: 9659 trio_ped = { 9660 "father": trio_samples[0], 9661 "mother": trio_samples[1], 9662 "child": trio_samples[2], 9663 } 9664 log.debug("TRIO pedigree is list str") 9665 else: 9666 msg_error = "TRIO pedigree not well formatted" 9667 log.error(msg_error) 9668 raise ValueError(msg_error) 9669 9670 # Trio pedigree is a dict 9671 elif isinstance(trio_ped, dict): 9672 log.debug("TRIO pedigree is dict") 9673 9674 # Trio pedigree is not well formatted 9675 else: 9676 msg_error = "TRIO pedigree not well formatted" 9677 log.error(msg_error) 9678 raise ValueError(msg_error) 9679 9680 # Construct trio list 9681 trio_samples = [ 9682 trio_ped.get("father", ""), 9683 trio_ped.get("mother", ""), 9684 
trio_ped.get("child", ""), 9685 ] 9686 9687 else: 9688 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9689 samples_list = self.get_header_sample_list() 9690 if len(samples_list) >= 3: 9691 trio_samples = self.get_header_sample_list()[0:3] 9692 trio_ped = { 9693 "father": trio_samples[0], 9694 "mother": trio_samples[1], 9695 "child": trio_samples[2], 9696 } 9697 else: 9698 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9699 log.error(msg_error) 9700 raise ValueError(msg_error) 9701 9702 # Check trio pedigree 9703 if not trio_ped or len(trio_ped) != 3: 9704 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9705 log.error(msg_error) 9706 raise ValueError(msg_error) 9707 9708 # Log 9709 log.info( 9710 f"Calculation 'TRIO' - Samples: " 9711 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9712 ) 9713 9714 # Field 9715 trio_infos = prefix + trio_tag 9716 9717 # Variants table 9718 table_variants = self.get_table_variants() 9719 9720 # Header 9721 vcf_reader = self.get_header() 9722 9723 # Create variant id 9724 variant_id_column = self.get_variant_id_column() 9725 added_columns = [variant_id_column] 9726 9727 # variant_id, FORMAT and samples 9728 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9729 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9730 ) 9731 9732 # Create dataframe 9733 dataframe_trio = self.get_query_to_df( 9734 f""" SELECT {samples_fields} FROM {table_variants} """ 9735 ) 9736 9737 # Create trio column 9738 dataframe_trio[trio_infos] = dataframe_trio.apply( 9739 lambda row: trio(row, samples=trio_samples), axis=1 9740 ) 9741 9742 # Add trio to header 9743 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9744 trio_tag, 9745 ".", 9746 "String", 9747 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9748 "howard calculation", 9749 "0", 9750 self.code_type_map.get("String"), 9751 ) 9752 9753 # Update 9754 sql_update = f""" 9755 UPDATE {table_variants} 9756 SET "INFO" = 9757 concat( 9758 CASE 9759 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9760 THEN '' 9761 ELSE concat("INFO", ';') 9762 END, 9763 CASE 9764 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9765 AND dataframe_trio."{trio_infos}" NOT NULL 9766 THEN concat( 9767 '{trio_tag}=', 9768 dataframe_trio."{trio_infos}" 9769 ) 9770 ELSE '' 9771 END 9772 ) 9773 FROM dataframe_trio 9774 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9775 """ 9776 self.conn.execute(sql_update) 9777 9778 # Remove added columns 9779 for added_column in added_columns: 9780 self.drop_column(column=added_column) 9781 9782 # Delete dataframe 9783 del dataframe_trio 9784 gc.collect() 9785 9786 def calculation_vaf_normalization(self) -> None: 9787 """ 9788 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9789 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9790 :return: The function does not return anything. 
9791 """ 9792 9793 # if FORMAT and samples 9794 if ( 9795 "FORMAT" in self.get_header_columns_as_list() 9796 and self.get_header_sample_list() 9797 ): 9798 9799 # vaf_normalization annotation field 9800 vaf_normalization_tag = "VAF" 9801 9802 # VCF infos tags 9803 vcf_infos_tags = { 9804 "VAF": "VAF Variant Frequency", 9805 } 9806 9807 # Prefix 9808 prefix = self.get_explode_infos_prefix() 9809 9810 # Variants table 9811 table_variants = self.get_table_variants() 9812 9813 # Header 9814 vcf_reader = self.get_header() 9815 9816 # Do not calculate if VAF already exists 9817 if "VAF" in vcf_reader.formats: 9818 log.debug("VAF already on genotypes") 9819 return 9820 9821 # Create variant id 9822 variant_id_column = self.get_variant_id_column() 9823 added_columns = [variant_id_column] 9824 9825 # variant_id, FORMAT and samples 9826 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9827 f""" "{sample}" """ for sample in self.get_header_sample_list() 9828 ) 9829 9830 # Create dataframe 9831 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9832 log.debug(f"query={query}") 9833 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9834 9835 vaf_normalization_set = [] 9836 9837 # for each sample vaf_normalization 9838 for sample in self.get_header_sample_list(): 9839 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9840 lambda row: vaf_normalization(row, sample=sample), axis=1 9841 ) 9842 vaf_normalization_set.append( 9843 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9844 ) 9845 9846 # Add VAF to FORMAT 9847 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9848 "FORMAT" 9849 ].apply(lambda x: str(x) + ":VAF") 9850 vaf_normalization_set.append( 9851 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9852 ) 9853 9854 # Add vaf_normalization to header 9855 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9856 id=vaf_normalization_tag, 9857 num="1", 9858 type="Float", 9859 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9860 type_code=self.code_type_map.get("Float"), 9861 ) 9862 9863 # Create fields to add in INFO 9864 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9865 9866 # Update 9867 sql_update = f""" 9868 UPDATE {table_variants} 9869 SET {sql_vaf_normalization_set} 9870 FROM dataframe_vaf_normalization 9871 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9872 9873 """ 9874 self.conn.execute(sql_update) 9875 9876 # Remove added columns 9877 for added_column in added_columns: 9878 self.drop_column(column=added_column) 9879 9880 # Delete dataframe 9881 del dataframe_vaf_normalization 9882 gc.collect() 9883 9884 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9885 """ 9886 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9887 field in a VCF file and updates the INFO column of the variants table with the calculated 9888 statistics. 9889 9890 :param info: The `info` parameter is a string that represents the type of information for which 9891 genotype statistics are calculated. 
It is used to generate various VCF info tags for the 9892 statistics, such as the number of occurrences, the list of values, the minimum value, the 9893 maximum value, the mean, the median, defaults to VAF 9894 :type info: str (optional) 9895 """ 9896 9897 # if FORMAT and samples 9898 if ( 9899 "FORMAT" in self.get_header_columns_as_list() 9900 and self.get_header_sample_list() 9901 ): 9902 9903 # vaf_stats annotation field 9904 vaf_stats_tag = info + "_stats" 9905 9906 # VCF infos tags 9907 vcf_infos_tags = { 9908 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9909 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9910 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9911 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9912 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9913 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9914 info 9915 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9916 } 9917 9918 # Prefix 9919 prefix = self.get_explode_infos_prefix() 9920 9921 # Field 9922 vaf_stats_infos = prefix + vaf_stats_tag 9923 9924 # Variants table 9925 table_variants = self.get_table_variants() 9926 9927 # Header 9928 vcf_reader = self.get_header() 9929 9930 # Create variant id 9931 variant_id_column = self.get_variant_id_column() 9932 added_columns = [variant_id_column] 9933 9934 # variant_id, FORMAT and samples 9935 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9936 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9937 ) 9938 9939 # Create dataframe 9940 dataframe_vaf_stats = self.get_query_to_df( 9941 f""" SELECT {samples_fields} FROM {table_variants} """ 9942 ) 9943 9944 # Create vaf_stats column 9945 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9946 lambda row: genotype_stats( 9947 row, samples=self.get_header_sample_list(), info=info 9948 ), 9949 axis=1, 9950 ) 9951 9952 # List of vcf tags 9953 sql_vaf_stats_fields = [] 9954 9955 # Check all VAF stats infos 9956 for stat in vcf_infos_tags: 9957 9958 # Extract stats 9959 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9960 lambda x: dict(x).get(stat, "") 9961 ) 9962 9963 # Add snpeff_hgvs to header 9964 vcf_reader.infos[stat] = vcf.parser._Info( 9965 stat, 9966 ".", 9967 "String", 9968 vcf_infos_tags.get(stat, "genotype statistics"), 9969 "howard calculation", 9970 "0", 9971 self.code_type_map.get("String"), 9972 ) 9973 9974 if len(sql_vaf_stats_fields): 9975 sep = ";" 9976 else: 9977 sep = "" 9978 9979 # Create fields to add in INFO 9980 sql_vaf_stats_fields.append( 9981 f""" 9982 CASE 9983 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9984 THEN concat( 9985 '{sep}{stat}=', 9986 dataframe_vaf_stats."{stat}" 9987 ) 9988 ELSE '' 9989 END 9990 """ 9991 ) 9992 9993 # SQL set for update 9994 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9995 9996 # Update 9997 sql_update = f""" 9998 UPDATE {table_variants} 9999 SET "INFO" = 10000 concat( 10001 CASE 10002 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10003 THEN '' 10004 ELSE concat("INFO", ';') 10005 END, 10006 {sql_vaf_stats_fields_set} 10007 ) 10008 FROM dataframe_vaf_stats 10009 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 10010 10011 """ 10012 self.conn.execute(sql_update) 10013 10014 # Remove added columns 10015 for added_column in added_columns: 10016 self.drop_column(column=added_column) 10017 10018 # Delete 
dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to the variants if transcripts are available.

        :param info_json: The `info_json` parameter is the name of the INFO field that will receive
        the transcripts information as JSON; defaults to None
        :type info_json: str
        :param info_format: The `info_format` parameter specifies the format of the INFO field that
        will receive the transcripts information
        :type info_format: str
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts according to the 'transcripts' prioritization parameters.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Prioritize transcripts
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_export(self) -> None:
        """
        The function `calculation_transcripts_export` creates a transcripts table and exports it
        according to the 'transcripts' export parameters.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Export transcripts
        if transcripts_table:
            self.transcripts_export(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    ###############
    # Transcripts #
    ###############

    def transcripts_export(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_export` function exports the transcripts table to the output file defined
        in the 'transcripts' export parameters.

        :param transcripts_table: name of the transcripts table to export
        :type transcripts_table: str
        :param param: parameters dictionary; if empty, the object's parameters are used
        :type param: dict
        :return: `False` if no export parameters are defined
        """

        log.debug("Start transcripts export...")

        # Param
        if not param:
            param = self.get_param()

        # Param export
        param_transcript_export = param.get("transcripts", {}).get("export", {})

        # Output file
        transcripts_export_output = param_transcript_export.get("output", None)

        if not param_transcript_export or not transcripts_export_output:
            log.warning("No transcripts export parameters defined!")
            return False

        # List of transcripts annotations
        query_describe = f"""
            SELECT column_name
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
        """
        transcripts_annotations_list = list(
            self.get_query_to_df(query=query_describe)["column_name"]
        )

        # Create transcripts table for export
        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )
        query_create_transcripts_table_export = f"""
            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
        """
        self.execute_query(query=query_create_transcripts_table_export)

        # Output file format
        transcripts_export_output_format = get_file_format(
            filename=transcripts_export_output
        )

        # Format VCF - construct INFO
        if transcripts_export_output_format in ["vcf"]:

            # Construct query update INFO and header
            query_update_info = []
            for field in transcripts_annotations_list:

                # If field not in header
                if field not in self.get_header_infos_list():

                    # Add field in header
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Add field as INFO/tag
                query_update_info.append(
                    f"""
                    CASE
                        WHEN "{field}" IS NOT NULL
                        THEN concat('{field}=', "{field}", ';')
                        ELSE ''
                    END
                    """
                )

            # Query param
            query_update_info_value = (
                f""" concat('', {", ".join(query_update_info)}) """
            )
            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.'
AS 'FILTER', "INFO" """ 10168 10169 else: 10170 10171 # Query param 10172 query_update_info_value = f""" NULL """ 10173 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10174 10175 # Update query INFO column 10176 query_update = f""" 10177 UPDATE {transcripts_table_export} 10178 SET INFO = {query_update_info_value} 10179 10180 """ 10181 self.execute_query(query=query_update) 10182 10183 # Export 10184 self.export_output( 10185 output_file=transcripts_export_output, 10186 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10187 ) 10188 10189 # Drop transcripts export table 10190 query_drop_transcripts_table_export = f""" 10191 DROP TABLE {transcripts_table_export} 10192 """ 10193 self.execute_query(query=query_drop_transcripts_table_export) 10194 10195 def transcripts_prioritization( 10196 self, transcripts_table: str = None, param: dict = {} 10197 ) -> bool: 10198 """ 10199 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10200 and updates the variants table with the prioritized information. 10201 10202 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10203 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 10204 This parameter is used to identify the table where the transcripts data is stored for the 10205 prioritization process 10206 :type transcripts_table: str 10207 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10208 that contains various configuration settings for the prioritization process of transcripts. It 10209 is used to customize the behavior of the prioritization algorithm and includes settings such as 10210 the prefix for prioritization fields, default profiles, and other 10211 :type param: dict 10212 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10213 transcripts prioritization process is successfully completed, and `False` if there are any 10214 issues or if no profile is defined for transcripts prioritization. 
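        Example `param` structure (a sketch built from the keys read below;
        profile name, prefix and field values are illustrative):

            {
                "transcripts": {
                    "prioritization": {
                        "profiles": ["default"],
                        "pzprefix": "PTZ",
                        "pzfields": ["Score", "Flag"],
                        "prioritization_transcripts": "transcripts.tsv",
                        "prioritization_transcripts_force": False,
                        "prioritization_transcripts_version_force": False,
                        "prioritization_transcripts_order": {
                            "PTZFlag": "DESC",
                            "PTZScore": "DESC"
                        }
                    }
                }
            }
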
10215 """ 10216 10217 log.debug("Start transcripts prioritization...") 10218 10219 # Param 10220 if not param: 10221 param = self.get_param() 10222 10223 # Variants table 10224 table_variants = self.get_table_variants() 10225 10226 # Transcripts table 10227 if transcripts_table is None: 10228 transcripts_table = self.create_transcript_view( 10229 transcripts_table="transcripts", param=param 10230 ) 10231 if transcripts_table is None: 10232 msg_err = "No Transcripts table availalble" 10233 log.error(msg_err) 10234 raise ValueError(msg_err) 10235 log.debug(f"transcripts_table={transcripts_table}") 10236 10237 # Get transcripts columns 10238 columns_as_list_query = f""" 10239 DESCRIBE {transcripts_table} 10240 """ 10241 columns_as_list = list( 10242 self.get_query_to_df(columns_as_list_query)["column_name"] 10243 ) 10244 10245 # Create INFO if not exists 10246 if "INFO" not in columns_as_list: 10247 query_add_info = f""" 10248 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10249 """ 10250 self.execute_query(query_add_info) 10251 10252 # Prioritization param and Force only PZ Score and Flag 10253 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10254 10255 # PZ profile by default 10256 pz_profile_default = ( 10257 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10258 ) 10259 10260 # Exit if no profile 10261 if pz_profile_default is None: 10262 log.warning("No profile defined for transcripts prioritization") 10263 return False 10264 10265 # PZ fields 10266 pz_param_pzfields = {} 10267 10268 # PZ field transcripts 10269 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10270 10271 # Add PZ Transcript in header 10272 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10273 pz_fields_transcripts, 10274 ".", 10275 "String", 10276 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10277 "unknown", 10278 "unknown", 10279 code_type_map["String"], 10280 ) 10281 10282 # Mandatory fields 10283 pz_mandatory_fields_list = [ 10284 "Score", 10285 "Flag", 10286 "Tags", 10287 "Comment", 10288 "Infos", 10289 "Class", 10290 ] 10291 pz_mandatory_fields = [] 10292 for pz_mandatory_field in pz_mandatory_fields_list: 10293 pz_mandatory_fields.append( 10294 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10295 ) 10296 10297 # PZ fields in param 10298 for pz_field in pz_param.get("pzfields", []): 10299 if pz_field in pz_mandatory_fields_list: 10300 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10301 pz_param.get("pzprefix", "PTZ") + pz_field 10302 ) 10303 else: 10304 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10305 pz_param_pzfields[pz_field] = pz_field_new 10306 10307 # Add PZ Transcript in header 10308 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10309 pz_field_new, 10310 ".", 10311 "String", 10312 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10313 "unknown", 10314 "unknown", 10315 code_type_map["String"], 10316 ) 10317 10318 # PZ fields param 10319 pz_param["pzfields"] = pz_mandatory_fields 10320 10321 # Prioritization 10322 prioritization_result = self.prioritization( 10323 table=transcripts_table, 10324 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10325 ) 10326 if not prioritization_result: 10327 log.warning("Transcripts prioritization not processed") 10328 return False 10329 10330 # PZ fields sql query 10331 query_update_select_list = [] 10332 
query_update_concat_list = [] 10333 query_update_order_list = [] 10334 for pz_param_pzfield in set( 10335 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10336 ): 10337 query_update_select_list.append(f" {pz_param_pzfield}, ") 10338 10339 for pz_param_pzfield in pz_param_pzfields: 10340 query_update_concat_list.append( 10341 f""" 10342 , CASE 10343 WHEN {pz_param_pzfield} IS NOT NULL 10344 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10345 ELSE '' 10346 END 10347 """ 10348 ) 10349 10350 # Order by 10351 pz_orders = ( 10352 param.get("transcripts", {}) 10353 .get("prioritization", {}) 10354 .get("prioritization_transcripts_order", {}) 10355 ) 10356 if not pz_orders: 10357 pz_orders = { 10358 pz_param.get("pzprefix", "PTZ") + "Flag": "DESC", 10359 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10360 } 10361 for pz_order in pz_orders: 10362 query_update_order_list.append( 10363 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10364 ) 10365 10366 # Fields to explode 10367 fields_to_explode = ( 10368 list(pz_param_pzfields.keys()) 10369 + pz_mandatory_fields 10370 + list(pz_orders.keys()) 10371 ) 10372 # Remove transcript column as a specific transcript column 10373 if "transcript" in fields_to_explode: 10374 fields_to_explode.remove("transcript") 10375 10376 # Fields intranscripts table 10377 query_transcripts_table = f""" 10378 DESCRIBE SELECT * FROM {transcripts_table} 10379 """ 10380 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10381 10382 # Check fields to explode 10383 for field_to_explode in fields_to_explode: 10384 if field_to_explode not in self.get_header_infos_list() + list( 10385 query_transcripts_table.column_name 10386 ): 10387 msg_err = f"INFO/{field_to_explode} NOT IN header" 10388 log.error(msg_err) 10389 raise ValueError(msg_err) 10390 10391 # Explode fields to explode 10392 self.explode_infos( 10393 table=transcripts_table, 10394 fields=fields_to_explode, 10395 ) 10396 10397 # Transcript preference file 10398 transcripts_preference_file = ( 10399 param.get("transcripts", {}) 10400 .get("prioritization", {}) 10401 .get("prioritization_transcripts", {}) 10402 ) 10403 transcripts_preference_file = full_path(transcripts_preference_file) 10404 10405 # Transcript preference forced 10406 transcript_preference_force = ( 10407 param.get("transcripts", {}) 10408 .get("prioritization", {}) 10409 .get("prioritization_transcripts_force", False) 10410 ) 10411 # Transcript version forced 10412 transcript_version_force = ( 10413 param.get("transcripts", {}) 10414 .get("prioritization", {}) 10415 .get("prioritization_transcripts_version_force", False) 10416 ) 10417 10418 # Transcripts Ranking 10419 if transcripts_preference_file: 10420 10421 # Transcripts file to dataframe 10422 if os.path.exists(transcripts_preference_file): 10423 transcripts_preference_dataframe = transcripts_file_to_df( 10424 transcripts_preference_file 10425 ) 10426 else: 10427 log.error( 10428 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10429 ) 10430 raise ValueError( 10431 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10432 ) 10433 10434 # Order by depending to transcript preference forcing 10435 if transcript_preference_force: 10436 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10437 else: 10438 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10439 10440 # 
Transcript columns joined depend on version consideration 10441 if transcript_version_force: 10442 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10443 else: 10444 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10445 10446 # Query ranking for update 10447 query_update_ranking = f""" 10448 SELECT 10449 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10450 ROW_NUMBER() OVER ( 10451 PARTITION BY "#CHROM", POS, REF, ALT 10452 ORDER BY {order_by} 10453 ) AS rn 10454 FROM {transcripts_table} 10455 LEFT JOIN 10456 ( 10457 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10458 FROM transcripts_preference_dataframe 10459 ) AS transcripts_preference 10460 ON {transcripts_version_join} 10461 """ 10462 10463 else: 10464 10465 # Query ranking for update 10466 query_update_ranking = f""" 10467 SELECT 10468 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10469 ROW_NUMBER() OVER ( 10470 PARTITION BY "#CHROM", POS, REF, ALT 10471 ORDER BY {" , ".join(query_update_order_list)} 10472 ) AS rn 10473 FROM {transcripts_table} 10474 """ 10475 10476 # Export Transcripts prioritization infos to variants table 10477 query_update = f""" 10478 WITH RankedTranscripts AS ( 10479 {query_update_ranking} 10480 ) 10481 UPDATE {table_variants} 10482 SET 10483 INFO = CONCAT(CASE 10484 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10485 THEN '' 10486 ELSE concat("INFO", ';') 10487 END, 10488 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10489 ) 10490 FROM 10491 RankedTranscripts 10492 WHERE 10493 rn = 1 10494 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10495 AND variants."POS" = RankedTranscripts."POS" 10496 AND variants."REF" = RankedTranscripts."REF" 10497 AND variants."ALT" = RankedTranscripts."ALT" 10498 """ 10499 10500 # log.debug(f"query_update={query_update}") 10501 self.execute_query(query=query_update) 10502 10503 # Return 10504 return True 10505 10506 def create_transcript_view_from_columns_map( 10507 self, 10508 transcripts_table: str = "transcripts", 10509 columns_maps: dict = {}, 10510 added_columns: list = [], 10511 temporary_tables: list = None, 10512 annotation_fields: list = None, 10513 column_rename: dict = {}, 10514 column_clean: bool = False, 10515 column_case: str = None, 10516 ) -> tuple[list, list, list]: 10517 """ 10518 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10519 specified columns mapping for transcripts data. 10520 10521 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10522 of the table where the transcripts data is stored or will be stored in the database. This table 10523 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10524 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10525 :type transcripts_table: str (optional) 10526 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10527 about how to map columns from a transcripts table to create a view. Each entry in the 10528 `columns_maps` list represents a mapping configuration for a specific set of columns. 
It typically includes
        the main transcript column (`transcripts_column`) and the parallel transcript information
        columns (`transcripts_infos_columns`)
        :type columns_maps: dict
        :param added_columns: The `added_columns` parameter is a list that accumulates the columns
        added to the variants table by exploding the main transcript column and the transcript
        information columns
        :type added_columns: list
        :param temporary_tables: The `temporary_tables` parameter is a list that accumulates the
        names of the temporary tables created while building the transcript view; these tables hold
        intermediate results before the final view is generated
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter is a list that accumulates the
        annotation fields exposed by the view, i.e. the (possibly renamed)
        `transcripts_infos_columns` of each `columns_maps` entry
        :type annotation_fields: list
        :param column_rename: The `column_rename` parameter is a dictionary mapping original column
        names to the names they should have in the view
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter is a boolean flag; if `True`, column names
        are cleaned with `clean_annotation_field` (non-alphanumeric characters removed), defaults to
        False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter sets the case transformation applied to
        column names in the view: "lower", "upper", or `None` to leave them unchanged
        :type column_case: str
        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing
        three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
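        Example call (a sketch; the column names follow the dbNSFP-style
        layout shown in the commented block below):

            added, tables, fields = self.create_transcript_view_from_columns_map(
                transcripts_table="transcripts",
                columns_maps=[
                    {
                        "transcripts_column": "Ensembl_transcriptid",
                        "transcripts_infos_columns": ["genename", "LIST_S2_score"],
                    }
                ],
            )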
10567 """ 10568 10569 log.debug("Start transcrpts view creation from columns map...") 10570 10571 # "from_columns_map": [ 10572 # { 10573 # "transcripts_column": "Ensembl_transcriptid", 10574 # "transcripts_infos_columns": [ 10575 # "genename", 10576 # "Ensembl_geneid", 10577 # "LIST_S2_score", 10578 # "LIST_S2_pred", 10579 # ], 10580 # }, 10581 # { 10582 # "transcripts_column": "Ensembl_transcriptid", 10583 # "transcripts_infos_columns": [ 10584 # "genename", 10585 # "VARITY_R_score", 10586 # "Aloft_pred", 10587 # ], 10588 # }, 10589 # ], 10590 10591 # Init 10592 if temporary_tables is None: 10593 temporary_tables = [] 10594 if annotation_fields is None: 10595 annotation_fields = [] 10596 10597 # Variants table 10598 table_variants = self.get_table_variants() 10599 10600 for columns_map in columns_maps: 10601 10602 # Log 10603 log.debug(f"columns_map={columns_map}") 10604 10605 # Transcript column 10606 transcripts_column = columns_map.get("transcripts_column", None) 10607 10608 # Transcripts infos columns 10609 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10610 10611 # Transcripts infos columns rename 10612 column_rename = columns_map.get("column_rename", column_rename) 10613 10614 # Transcripts infos columns clean 10615 column_clean = columns_map.get("column_clean", column_clean) 10616 10617 # Transcripts infos columns case 10618 column_case = columns_map.get("column_case", column_case) 10619 10620 if transcripts_column is not None: 10621 10622 # Explode 10623 added_columns += self.explode_infos( 10624 fields=[transcripts_column] + transcripts_infos_columns 10625 ) 10626 10627 # View clauses 10628 clause_select_variants = [] 10629 clause_select_tanscripts = [] 10630 for field in [transcripts_column] + transcripts_infos_columns: 10631 10632 # AS field 10633 as_field = field 10634 10635 # Rename 10636 if column_rename: 10637 as_field = column_rename.get(as_field, as_field) 10638 10639 # Clean 10640 if column_clean: 10641 as_field = clean_annotation_field(as_field) 10642 10643 # Case 10644 if column_case: 10645 if column_case.lower() in ["lower"]: 10646 as_field = as_field.lower() 10647 elif column_case.lower() in ["upper"]: 10648 as_field = as_field.upper() 10649 10650 # Clause select Variants 10651 clause_select_variants.append( 10652 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10653 ) 10654 10655 if field in [transcripts_column]: 10656 clause_select_tanscripts.append( 10657 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10658 ) 10659 else: 10660 clause_select_tanscripts.append( 10661 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10662 ) 10663 annotation_fields.append(as_field) 10664 10665 # Query View 10666 query = f""" 10667 SELECT 10668 "#CHROM", POS, REF, ALT, INFO, 10669 "{transcripts_column}" AS 'transcript', 10670 {", ".join(clause_select_tanscripts)} 10671 FROM ( 10672 SELECT 10673 "#CHROM", POS, REF, ALT, INFO, 10674 {", ".join(clause_select_variants)} 10675 FROM {table_variants} 10676 ) 10677 WHERE "{transcripts_column}" IS NOT NULL 10678 """ 10679 10680 # Create temporary table 10681 temporary_table = transcripts_table + "".join( 10682 random.choices(string.ascii_uppercase + string.digits, k=10) 10683 ) 10684 10685 # # Temporary_tables 10686 # temporary_tables.append(temporary_table) 10687 # query_view = f""" 10688 # CREATE TEMPORARY TABLE {temporary_table} 10689 # AS ({query}) 10690 # """ 10691 # self.execute_query(query=query_view) 10692 10693 # Temporary_tables 10694 
temporary_tables.append(temporary_table) 10695 10696 # List of unique #CHROM 10697 query_unique_chrom = f""" 10698 SELECT DISTINCT "#CHROM" 10699 FROM variants 10700 """ 10701 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 10702 10703 # Create table with structure but without data 10704 query_create_table = f""" 10705 CREATE TABLE {temporary_table} 10706 AS ({query} LIMIT 0) 10707 """ 10708 self.execute_query(query=query_create_table) 10709 10710 # Process by #CHROM 10711 for chrom in unique_chroms["#CHROM"]: 10712 10713 # Log 10714 log.debug(f"Processing #CHROM={chrom}") 10715 10716 # Select data by #CHROM 10717 query_chunk = f""" 10718 SELECT * 10719 FROM ({query}) 10720 WHERE "#CHROM" = '{chrom}' 10721 """ 10722 10723 # Insert data 10724 query_insert_chunk = f""" 10725 INSERT INTO {temporary_table} 10726 {query_chunk} 10727 """ 10728 self.execute_query(query=query_insert_chunk) 10729 10730 return added_columns, temporary_tables, annotation_fields 10731 10732 def create_transcript_view_from_column_format( 10733 self, 10734 transcripts_table: str = "transcripts", 10735 column_formats: dict = {}, 10736 temporary_tables: list = None, 10737 annotation_fields: list = None, 10738 column_rename: dict = {}, 10739 column_clean: bool = False, 10740 column_case: str = None, 10741 ) -> tuple[list, list, list]: 10742 """ 10743 The `create_transcript_view_from_column_format` function generates a transcript view based on 10744 specified column formats, adds additional columns and annotation fields, and returns the list of 10745 temporary tables and annotation fields. 10746 10747 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10748 of the table containing the transcripts data. This table will be used as the base table for 10749 creating the transcript view. The default value for this parameter is "transcripts", but you can 10750 provide a different table name if needed, defaults to transcripts 10751 :type transcripts_table: str (optional) 10752 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10753 about the columns to be used for creating the transcript view. Each entry in the dictionary 10754 specifies the mapping between a transcripts column and a transcripts infos column. This 10755 parameter allows you to define how the columns from the transcripts table should be transformed 10756 or mapped 10757 :type column_formats: dict 10758 :param temporary_tables: The `temporary_tables` parameter in the 10759 `create_transcript_view_from_column_format` function is a list that stores the names of 10760 temporary views created during the process of creating a transcript view from a column format. 10761 These temporary views are used to manipulate and extract data before generating the final 10762 transcript view 10763 :type temporary_tables: list 10764 :param annotation_fields: The `annotation_fields` parameter in the 10765 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10766 that are extracted from the temporary views created during the process. 
These annotation fields 10767 are obtained by querying the temporary views and extracting the column names excluding specific 10768 columns like `#CH 10769 :type annotation_fields: list 10770 :param column_rename: The `column_rename` parameter in the 10771 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10772 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10773 column names to new column names in this dictionary, you can rename specific columns during the 10774 process 10775 :type column_rename: dict 10776 :param column_clean: The `column_clean` parameter in the 10777 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10778 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10779 will be cleaned during the creation of the transcript view based on the specified column format, 10780 defaults to False 10781 :type column_clean: bool (optional) 10782 :param column_case: The `column_case` parameter in the 10783 `create_transcript_view_from_column_format` function is used to specify the case transformation 10784 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10785 to convert the column names to uppercase or lowercase, respectively 10786 :type column_case: str 10787 :return: The `create_transcript_view_from_column_format` function returns two lists: 10788 `temporary_tables` and `annotation_fields`. 10789 """ 10790 10791 log.debug("Start transcrpts view creation from column format...") 10792 10793 # "from_column_format": [ 10794 # { 10795 # "transcripts_column": "ANN", 10796 # "transcripts_infos_column": "Feature_ID", 10797 # } 10798 # ], 10799 10800 # Init 10801 if temporary_tables is None: 10802 temporary_tables = [] 10803 if annotation_fields is None: 10804 annotation_fields = [] 10805 10806 for column_format in column_formats: 10807 10808 # annotation field and transcript annotation field 10809 annotation_field = column_format.get("transcripts_column", "ANN") 10810 transcript_annotation = column_format.get( 10811 "transcripts_infos_column", "Feature_ID" 10812 ) 10813 10814 # Transcripts infos columns rename 10815 column_rename = column_format.get("column_rename", column_rename) 10816 10817 # Transcripts infos columns clean 10818 column_clean = column_format.get("column_clean", column_clean) 10819 10820 # Transcripts infos columns case 10821 column_case = column_format.get("column_case", column_case) 10822 10823 # Temporary View name 10824 temporary_view_name = transcripts_table + "".join( 10825 random.choices(string.ascii_uppercase + string.digits, k=10) 10826 ) 10827 10828 # Create temporary view name 10829 temporary_view_name = self.annotation_format_to_table( 10830 uniquify=True, 10831 annotation_field=annotation_field, 10832 view_name=temporary_view_name, 10833 annotation_id=transcript_annotation, 10834 column_rename=column_rename, 10835 column_clean=column_clean, 10836 column_case=column_case, 10837 ) 10838 10839 # Annotation fields 10840 if temporary_view_name: 10841 query_annotation_fields = f""" 10842 SELECT * 10843 FROM ( 10844 DESCRIBE SELECT * 10845 FROM {temporary_view_name} 10846 ) 10847 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10848 """ 10849 df_annotation_fields = self.get_query_to_df( 10850 query=query_annotation_fields 10851 ) 10852 10853 # Add temporary view and annotation fields 10854 temporary_tables.append(temporary_view_name) 
annotation_fields += list(set(df_annotation_fields["column_name"]))

        return temporary_tables, annotation_fields

    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data from a
        specified table based on provided parameters and structural information.

        :param transcripts_table: The `transcripts_table` parameter is the name of the table that
        will store the final transcript view data. If not provided, the name is taken from
        `param["transcripts"]["table"]`, defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: The `transcripts_table_drop` parameter determines whether an
        existing transcripts table should be dropped before creating a new one; if `False` and the
        table already exists, the existing table is returned as-is, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: The `param` parameter is a dictionary describing how to build the transcript
        view: the structure of the transcripts annotations (`struct`, with `from_columns_map` and
        `from_column_format`), transcript ID version and mapping options, and the target table name
        :type param: dict
        :return: The `create_transcript_view` function returns the name of the transcripts table
        that was created or found.
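        Example `param` structure (a sketch based on the keys read below;
        values are illustrative, and `struct` may also contain a
        `from_columns_map` section):

            {
                "transcripts": {
                    "table": "transcripts",
                    "transcript_id_remove_version": False,
                    "transcript_id_mapping_file": None,
                    "transcript_id_mapping_force": None,
                    "struct": {
                        "from_column_format": [
                            {
                                "transcripts_column": "ANN",
                                "transcripts_infos_column": "Feature_ID"
                            }
                        ]
                    }
                }
            }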
10886 """ 10887 10888 log.debug("Start transcripts view creation...") 10889 10890 # Default 10891 transcripts_table_default = "transcripts" 10892 10893 # Param 10894 if not param: 10895 param = self.get_param() 10896 10897 # Struct 10898 struct = param.get("transcripts", {}).get("struct", None) 10899 10900 # Transcript veresion 10901 transcript_id_remove_version = param.get("transcripts", {}).get( 10902 "transcript_id_remove_version", False 10903 ) 10904 10905 # Transcripts mapping 10906 transcript_id_mapping_file = param.get("transcripts", {}).get( 10907 "transcript_id_mapping_file", None 10908 ) 10909 10910 # Transcripts mapping 10911 transcript_id_mapping_force = param.get("transcripts", {}).get( 10912 "transcript_id_mapping_force", None 10913 ) 10914 10915 # Transcripts table 10916 if transcripts_table is None: 10917 transcripts_table = param.get("transcripts", {}).get( 10918 "table", transcripts_table_default 10919 ) 10920 10921 # Check transcripts table exists 10922 if transcripts_table: 10923 10924 # Query to check if transcripts table exists 10925 query_check_table = f""" 10926 SELECT * 10927 FROM information_schema.tables 10928 WHERE table_name = '{transcripts_table}' 10929 """ 10930 df_check_table = self.get_query_to_df(query=query_check_table) 10931 10932 # Check if transcripts table exists 10933 if len(df_check_table) > 0 and not transcripts_table_drop: 10934 log.debug(f"Table {transcripts_table} exists and not drop option") 10935 return transcripts_table 10936 10937 if struct: 10938 10939 # added_columns 10940 added_columns = [] 10941 10942 # Temporary tables 10943 temporary_tables = [] 10944 10945 # Annotation fields 10946 annotation_fields = [] 10947 10948 # from columns map 10949 columns_maps = struct.get("from_columns_map", []) 10950 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10951 self.create_transcript_view_from_columns_map( 10952 transcripts_table=transcripts_table, 10953 columns_maps=columns_maps, 10954 added_columns=added_columns, 10955 temporary_tables=temporary_tables, 10956 annotation_fields=annotation_fields, 10957 ) 10958 ) 10959 added_columns += added_columns_tmp 10960 temporary_tables += temporary_tables_tmp 10961 annotation_fields += annotation_fields_tmp 10962 10963 # from column format 10964 column_formats = struct.get("from_column_format", []) 10965 temporary_tables_tmp, annotation_fields_tmp = ( 10966 self.create_transcript_view_from_column_format( 10967 transcripts_table=transcripts_table, 10968 column_formats=column_formats, 10969 temporary_tables=temporary_tables, 10970 annotation_fields=annotation_fields, 10971 ) 10972 ) 10973 temporary_tables += temporary_tables_tmp 10974 annotation_fields += annotation_fields_tmp 10975 10976 # Remove some specific fields/column 10977 annotation_fields = list(set(annotation_fields)) 10978 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 10979 if field in annotation_fields: 10980 annotation_fields.remove(field) 10981 10982 # Merge temporary tables query 10983 query_merge = "" 10984 for temporary_table in list(set(temporary_tables)): 10985 10986 # First temporary table 10987 if not query_merge: 10988 query_merge = f""" 10989 SELECT * FROM {temporary_table} 10990 """ 10991 # other temporary table (using UNION) 10992 else: 10993 query_merge += f""" 10994 UNION BY NAME SELECT * FROM {temporary_table} 10995 """ 10996 10997 # transcript table tmp 10998 transcript_table_tmp = "transcripts_tmp" 10999 transcript_table_tmp2 = "transcripts_tmp2" 11000 transcript_table_tmp3 = 
"transcripts_tmp3" 11001 11002 # Merge on transcript 11003 query_merge_on_transcripts_annotation_fields = [] 11004 11005 # Add transcript list 11006 query_merge_on_transcripts_annotation_fields.append( 11007 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 11008 ) 11009 11010 # Aggregate all annotations fields 11011 for annotation_field in set(annotation_fields): 11012 query_merge_on_transcripts_annotation_fields.append( 11013 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 11014 ) 11015 11016 # Transcripts mapping 11017 if transcript_id_mapping_file: 11018 11019 # Transcript dataframe 11020 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 11021 transcript_id_mapping_dataframe = transcripts_file_to_df( 11022 transcript_id_mapping_file, column_names=["transcript", "alias"] 11023 ) 11024 11025 # Transcript version remove 11026 if transcript_id_remove_version: 11027 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 11028 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 11029 query_left_join = f""" 11030 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11031 """ 11032 else: 11033 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 11034 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 11035 query_left_join = f""" 11036 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11037 """ 11038 11039 # Transcript column for group by merge 11040 query_transcript_merge_group_by = """ 11041 CASE 11042 WHEN transcript_mapped NOT IN ('') 11043 THEN split_part(transcript_mapped, '.', 1) 11044 ELSE split_part(transcript_original, '.', 1) 11045 END 11046 """ 11047 11048 # Merge query 11049 transcripts_tmp2_query = f""" 11050 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 11051 FROM ({query_merge}) AS {transcript_table_tmp} 11052 {query_left_join} 11053 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 11054 """ 11055 11056 # Retrive columns after mege 11057 transcripts_tmp2_describe_query = f""" 11058 DESCRIBE {transcripts_tmp2_query} 11059 """ 11060 transcripts_tmp2_describe_list = list( 11061 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 11062 "column_name" 11063 ] 11064 ) 11065 11066 # Create list of columns for select clause 11067 transcripts_tmp2_describe_select_clause = [] 11068 for field in transcripts_tmp2_describe_list: 11069 if field not in [ 11070 "#CHROM", 11071 "POS", 11072 "REF", 11073 "ALT", 11074 "INFO", 11075 "transcript_mapped", 11076 ]: 11077 as_field = field 11078 if field in ["transcript_original"]: 11079 as_field = "transcripts_mapped" 11080 transcripts_tmp2_describe_select_clause.append( 11081 f""" 
list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11082 ) 11083 11084 # Merge with mapping 11085 query_merge_on_transcripts = f""" 11086 SELECT 11087 "#CHROM", POS, REF, ALT, INFO, 11088 CASE 11089 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11090 THEN ANY_VALUE(transcript_mapped) 11091 ELSE ANY_VALUE(transcript_original) 11092 END AS transcript, 11093 {", ".join(transcripts_tmp2_describe_select_clause)} 11094 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11095 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11096 {query_transcript_merge_group_by} 11097 """ 11098 11099 # Add transcript filter from mapping file 11100 if transcript_id_mapping_force: 11101 query_merge_on_transcripts = f""" 11102 SELECT * 11103 FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3} 11104 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11105 """ 11106 11107 # No transcript mapping 11108 else: 11109 11110 # Remove transcript version 11111 if transcript_id_remove_version: 11112 query_transcript_column = f""" 11113 split_part({transcript_table_tmp}.transcript, '.', 1) 11114 """ 11115 else: 11116 query_transcript_column = """ 11117 transcript 11118 """ 11119 11120 # Query sections 11121 query_transcript_column_select = ( 11122 f"{query_transcript_column} AS transcript" 11123 ) 11124 query_transcript_column_group_by = query_transcript_column 11125 11126 # Query for transcripts view 11127 query_merge_on_transcripts = f""" 11128 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11129 FROM ({query_merge}) AS {transcript_table_tmp} 11130 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11131 """ 11132 11133 # Drop transcript view is necessary 11134 if transcripts_table_drop: 11135 query_drop = f""" 11136 DROP TABLE IF EXISTS {transcripts_table}; 11137 """ 11138 self.execute_query(query=query_drop) 11139 11140 # # Merge and create transcript view 11141 # query_create_view = f""" 11142 # CREATE TABLE IF NOT EXISTS {transcripts_table} 11143 # AS {query_merge_on_transcripts} 11144 # """ 11145 # self.execute_query(query=query_create_view) 11146 11147 # Using #CHROM chunk 11148 ###### 11149 11150 # List of unique #CHROM 11151 query_unique_chrom = f""" 11152 SELECT DISTINCT "#CHROM" 11153 FROM variants AS subquery 11154 """ 11155 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11156 11157 # Create table with structure but without data, if not exists 11158 query_create_table = f""" 11159 CREATE TABLE IF NOT EXISTS {transcripts_table} AS 11160 SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0 11161 """ 11162 self.execute_query(query=query_create_table) 11163 11164 # Process by #CHROM 11165 for chrom in unique_chroms["#CHROM"]: 11166 11167 # Log 11168 log.debug(f"Processing #CHROM={chrom}") 11169 11170 # Select data by #CHROM 11171 query_chunk = f""" 11172 SELECT * 11173 FROM ({query_merge_on_transcripts}) 11174 WHERE "#CHROM" = '{chrom}' 11175 """ 11176 11177 # Insert data 11178 query_insert_chunk = f""" 11179 INSERT INTO {transcripts_table} 11180 {query_chunk} 11181 """ 11182 self.execute_query(query=query_insert_chunk) 11183 11184 # Remove temporary tables 11185 if temporary_tables: 11186 for temporary_table in list(set(temporary_tables)): 11187 query_drop_tmp_table = f""" 11188 DROP TABLE IF EXISTS 
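    # Usage sketch (illustrative only; `vs` and the parameter values are
    # hypothetical): build the transcripts view from a snpEff "ANN" field,
    # stripping transcript versions. The `param` keys mirror those read above.
    #
    #     table = vs.create_transcript_view(
    #         transcripts_table="transcripts",
    #         transcripts_table_drop=True,
    #         param={
    #             "transcripts": {
    #                 "table": "transcripts",
    #                 "transcript_id_remove_version": True,
    #                 "struct": {
    #                     "from_column_format": [
    #                         {
    #                             "transcripts_column": "ANN",
    #                             "transcripts_infos_column": "Feature_ID",
    #                         }
    #                     ]
    #                 },
    #             }
    #         },
    #     )
    #     # Returns "transcripts", or None when no "struct" section is provided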
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        The `annotation_format_to_table` function converts annotation data from a VCF file into a
        structured table format, ensuring unique values and creating a temporary table for further
        processing or analysis.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
        unique values in the output or not. If set to `True`, the function will make sure that the
        output values are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file
        that contains the annotation information for each variant. This field is used to extract the
        annotation details for further processing in the function, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method
        is used to specify the identifier for the annotation feature. This identifier will be used as a
        column name in the resulting table or view that is created based on the annotation data. It
        helps in uniquely identifying each annotation entry in the table, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used
        to specify the name of the temporary table that will be created to store the transformed
        annotation data. This table will hold the extracted information from the annotation field in a
        structured format for further processing or analysis, defaults to transcripts
        :type view_name: str (optional)
        :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method
        is a dictionary that allows you to specify custom renaming for columns. By providing key-value
        pairs in this dictionary, you can rename specific columns in the resulting table or view that
        is created based on the annotation data
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is
        a boolean flag that determines whether the annotation field should undergo a cleaning process.
        If set to `True`, the function will clean the annotation field before further processing. This
        cleaning step may involve removing unwanted characters or formatting inconsistencies, defaults
        to False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is
        used to specify the case transformation to be applied to the column names extracted from the
        annotation data. It allows you to set the case of the column names to either lowercase or
        uppercase for consistency or other specific requirements during the conversion
        :type column_case: str
        :return: The function `annotation_format_to_table` is returning the name of the view created,
        which is stored in the variable `view_name`.
        """

        # Annotation field
        annotation_format = "annotation_explode"

        # Transcript annotation
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix (use 'INFO/' when an explode prefix is configured)
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Annotation fields
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Explode annotation field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Get list of #CHROM
            query_unique_chrom = f"""
                SELECT DISTINCT "#CHROM"
                FROM variants AS subquery
            """
            unique_chroms = self.get_query_to_df(query=query_unique_chrom)

            # Base query for the annotation format dataframes
            dataframe_annotation_format_base = f"""
                SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}"
                FROM {table_variants}
            """

            # Create a dataframe sample to infer key column types
            dataframe_annotation_format = self.get_query_to_df(
                f""" {dataframe_annotation_format_base} LIMIT 1000 """
            )

            # Define a vectorized function to apply explode_annotation_format
            vectorized_explode_annotation_format = np.vectorize(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Assign the exploded annotations back to the dataframe
            dataframe_annotation_format[annotation_format_infos] = (
                vectorized_explode_annotation_format(
                    dataframe_annotation_format[annotation_infos].to_numpy()
                )
            )

            # Find keys (the pandas dataframe is queried by name through DuckDB)
            query_json = f"""
                SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key'
                FROM dataframe_annotation_format;
            """
            df_keys = self.get_query_to_df(query=query_json)

            # Build a typed extraction expression for each key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Type
                query_json_type = f"""
                    SELECT *
                    FROM (
                        SELECT
                            NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '') AS '{key_clean}'
                        FROM
                            dataframe_annotation_format
                    )
                    WHERE "{key_clean}" NOT NULL AND "{key_clean}" NOT IN ('')
                """

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Free up memory
                del df_json_type

                # Append
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create table with structure but without data, if not exists
            query_create_table = f"""
                CREATE TABLE IF NOT EXISTS {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                    LIMIT 0
                );
            """
            self.execute_query(query=query_create_table)

            # Free up memory
            del dataframe_annotation_format

            # Insert data by chromosome
            for chrom in unique_chroms["#CHROM"]:

                # Log
                log.debug(f"Processing #CHROM={chrom}")

                # Create dataframe
                dataframe_annotation_format = self.get_query_to_df(
                    f""" {dataframe_annotation_format_base} WHERE "#CHROM" = '{chrom}' """
                )

                # Define a vectorized function to apply explode_annotation_format
                vectorized_explode_annotation_format = np.vectorize(
                    lambda x: explode_annotation_format(
                        annotation=str(x),
                        uniquify=uniquify,
                        output_format="JSON",
                        prefix="",
                        header=list(ann_header_desc.values()),
                    )
                )

                # Assign the exploded annotations back to the dataframe
                dataframe_annotation_format[annotation_format_infos] = (
                    vectorized_explode_annotation_format(
                        dataframe_annotation_format[annotation_infos].to_numpy()
                    )
                )

                # Insert chunk into the view table
                query_insert_chunk = f"""
                    INSERT INTO {view_name}
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                """
                self.execute_query(query=query_insert_chunk)

                # Free up memory
                del dataframe_annotation_format

        else:

            # Return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
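    # Usage sketch (illustrative only; `vs` and the view name are hypothetical):
    # explode a snpEff-style ANN INFO field, whose header description lists
    # sub-fields separated by " | ", into a per-transcript table with lowercase
    # column names.
    #
    #     view = vs.annotation_format_to_table(
    #         annotation_field="ANN",
    #         annotation_id="Feature_ID",
    #         view_name="transcripts_ANN_tmp",
    #         column_case="lower",
    #     )
    #     # `view` is the created table name, or None if "ANN" is not in the header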
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        The `transcript_view_to_variants` function updates the variants table with information from the
        transcripts view, in JSON and/or structured ('transcript|...') format.

        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
        table containing the transcripts data. If this parameter is not provided, the function will
        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
        column in the `transcripts_table` that contains the unique identifier for each transcript. This
        identifier is used to match transcripts with variants in the database
        :type transcripts_column_id: str
        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
        of the column in the variants table where the transcripts information will be stored in JSON
        format
        :type transcripts_info_json: str
        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
        specify the field in the VCF header that will contain information about transcripts in JSON
        format. This field will be added to the VCF header as an INFO field with the specified name
        :type transcripts_info_field_json: str
        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
        name of the column in the variants table where the transcripts information will be stored in a
        structured (pipe-separated) format
        :type transcripts_info_format: str
        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
        specify the field in the VCF header that will contain information about transcripts in a
        structured format. This field will be added to the VCF header as an INFO field with the
        specified name
        :type transcripts_info_field_format: str
        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
        that contains various configuration settings related to transcripts. It is used to provide
        default values for certain parameters if they are not explicitly provided when calling the
        method
        :type param: dict
        :return: The function `transcript_view_to_variants` returns a boolean value: `True` if the
        operation is successful, and `False` if no output column or INFO field was requested.
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info JSON
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )

        # Transcripts info FORMAT
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )

        # Variants table
        table_variants = self.get_table_variants()

        # Check info columns param
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # View results
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # Update clauses
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknown",
                "unknown",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Add to update
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknown",
                "unknown",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknown",
                "unknown",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # Set variable for internal queries
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in structured FORMAT
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Add to update
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknown",
                "unknown",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
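    # Usage sketch (illustrative only; `vs` and the field names are hypothetical):
    # write transcript annotations back into the variants table, both as a JSON
    # INFO field and as a pipe-separated structured INFO field.
    #
    #     ok = vs.transcript_view_to_variants(
    #         transcripts_table="transcripts",
    #         transcripts_column_id="transcript",
    #         transcripts_info_json="transcripts_json",         # JSON column on the variants table
    #         transcripts_info_field_json="transcripts_json",   # INFO field carrying the JSON
    #         transcripts_info_field_format="transcripts_ann",  # INFO field in 'transcript|...' format
    #     )
    #     # Returns True on success, False when no output column or field is requested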
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        The `rename_info_fields` function renames specified fields in a VCF file header and updates
        corresponding INFO fields in the variants table.

        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
        represent the original field names that need to be renamed, and the corresponding values
        represent the new names for the fields; a value of `None` removes the field
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
        the table in which the variants data is stored. This table contains information about genetic
        variants, and the function updates the corresponding INFO fields in this table when renaming
        specified fields in the VCF file header
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
        the original field names as keys and their corresponding new names (or None if the field was
        removed) as values, after renaming or removing the specified fields in the VCF file header and
        updating the corresponding INFO fields in the variants table.
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # Regexp replace function, partitioned so that each UPDATE handles at most
        # `regex_replace_partition` nested regexp_replace calls
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "concat(INFO, ';')"  # Add ';' to reduce regexp complexity

        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    # Drop the original header entry in both cases (rename or removal)
                    del header.infos[field_to_rename]

                    # Rename INFO patterns
                    field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;"
                    if field_renamed is not None:
                        field_renamed_pattern = rf"\1{field_renamed}\3;"
                    else:
                        field_renamed_pattern = r"\1"

                    # Regexp replace (nested, partitioned)
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(
                        regex_replace_nb / regex_replace_partition
                    )
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "concat(INFO, ';')"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'"
                        )
                    else:
                        log.info(
                            f"Rename or remove fields - field '{field_to_rename}' removed"
                        )

                else:

                    log.warning(
                        f"Rename or remove fields - field '{field_to_rename}' not in header"
                    )

            # Rename INFO
            for regex_replace_key, regex_replace in regex_replace_dict.items():
                log.info(
                    f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..."
                )
                query = f"""
                    UPDATE {table}
                    SET
                        INFO = regexp_replace({regex_replace}, ';$', '')
                """
                log.debug(f"query={query}")
                self.execute_query(query=query)

        return fields_renamed
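    # Usage sketch (illustrative only; field names are hypothetical): rename one
    # INFO field and remove another. A value of None removes the field from both
    # the header and the INFO column.
    #
    #     renamed = vs.rename_info_fields(
    #         fields_to_rename={
    #             "gnomadAltFreq": "gnomad_AF",  # rename
    #             "OLD_SCORE": None,             # remove
    #         }
    #     )
    #     # renamed == {"gnomadAltFreq": "gnomad_AF", "OLD_SCORE": None}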
    def calculation_rename_info_fields(
        self,
        fields_to_rename: dict = None,
        table: str = None,
        operation_name: str = "RENAME_INFO_FIELDS",
    ) -> None:
        """
        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
        fields to rename and table if provided, and then calls another function to rename the fields.

        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
        renamed in a table. Each key-value pair in the dictionary represents the original field name as
        the key and the new field name as the value
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
        specify the name of the table for which the fields are to be renamed. It is a string type
        parameter
        :type table: str
        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
        method is a string that specifies the name of the operation being performed. In this context,
        it is used as a default value for the operation name if not explicitly provided when calling
        the function, defaults to RENAME_INFO_FIELDS
        :type operation_name: str (optional)
        """

        # Param
        param = self.get_param()

        # Get param fields to rename
        param_fields_to_rename = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("fields_to_rename", None)
        )

        # Get param table
        param_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("table", None)
        )

        # Init fields_to_rename
        if fields_to_rename is None:
            fields_to_rename = param_fields_to_rename

        # Init table
        if table is None:
            table = param_table

        renamed_fields = self.rename_info_fields(
            fields_to_rename=fields_to_rename, table=table
        )

        log.debug(f"renamed_fields:{renamed_fields}")
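    # Usage sketch (illustrative only; field names are hypothetical): the same
    # renaming driven by the "calculation" parameter structure read above.
    #
    #     vs.set_param(
    #         {
    #             "calculation": {
    #                 "calculations": {
    #                     "RENAME_INFO_FIELDS": {
    #                         "fields_to_rename": {"OLD_FIELD": "NEW_FIELD"},
    #                     }
    #                 }
    #             }
    #         }
    #     )
    #     vs.calculation_rename_info_fields()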
37class Variants: 38 39 def __init__( 40 self, 41 conn=None, 42 input: str = None, 43 output: str = None, 44 config: dict = {}, 45 param: dict = {}, 46 load: bool = False, 47 ) -> None: 48 """ 49 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 50 header 51 52 :param conn: the connection to the database 53 :param input: the input file 54 :param output: the output file 55 :param config: a dictionary containing the configuration of the model 56 :param param: a dictionary containing the parameters of the model 57 """ 58 59 # Init variables 60 self.init_variables() 61 62 # Input 63 self.set_input(input) 64 65 # Config 66 self.set_config(config) 67 68 # Param 69 self.set_param(param) 70 71 # Output 72 self.set_output(output) 73 74 # connexion 75 self.set_connexion(conn) 76 77 # Header 78 self.set_header() 79 80 # Samples 81 self.set_samples() 82 83 # Load data 84 if load: 85 self.load_data() 86 87 def set_samples(self, samples: list = None) -> list: 88 """ 89 The function `set_samples` sets the samples attribute of an object to a provided list or 90 retrieves it from a parameter dictionary. 91 92 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 93 input and sets the `samples` attribute of the class to the provided list. If no samples are 94 provided, it tries to get the samples from the class's parameters using the `get_param` method 95 :type samples: list 96 :return: The `samples` list is being returned. 97 """ 98 99 if not samples: 100 samples = self.get_param().get("samples", {}).get("list", None) 101 102 self.samples = samples 103 104 return samples 105 106 def get_samples(self) -> list: 107 """ 108 This function returns a list of samples. 109 :return: The `get_samples` method is returning the `samples` attribute of the object. 110 """ 111 112 return self.samples 113 114 def get_samples_check(self) -> bool: 115 """ 116 This function returns the value of the "check" key within the "samples" dictionary retrieved 117 from the parameters. 118 :return: The method `get_samples_check` is returning the value of the key "check" inside the 119 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 120 method. If the key "check" is not found, it will return `False`. 121 """ 122 123 return self.get_param().get("samples", {}).get("check", True) 124 125 def set_input(self, input: str = None) -> None: 126 """ 127 The function `set_input` takes a file name as input, extracts the name and extension, and sets 128 attributes in the class accordingly. 129 130 :param input: The `set_input` method in the provided code snippet is used to set attributes 131 related to the input file. Here's a breakdown of the parameters and their usage in the method: 132 :type input: str 133 """ 134 135 if input and not isinstance(input, str): 136 try: 137 self.input = input.name 138 except: 139 log.error(f"Input file '{input} in bad format") 140 raise ValueError(f"Input file '{input} in bad format") 141 else: 142 self.input = input 143 144 # Input format 145 if input: 146 input_name, input_extension = os.path.splitext(self.input) 147 self.input_name = input_name 148 self.input_extension = input_extension 149 self.input_format = self.input_extension.replace(".", "") 150 151 def set_config(self, config: dict) -> None: 152 """ 153 The set_config function takes a config object and assigns it as the configuration object for the 154 class. 
155 156 :param config: The `config` parameter in the `set_config` function is a dictionary object that 157 contains configuration settings for the class. When you call the `set_config` function with a 158 dictionary object as the argument, it will set that dictionary as the configuration object for 159 the class 160 :type config: dict 161 """ 162 163 self.config = config 164 165 def set_param(self, param: dict) -> None: 166 """ 167 This function sets a parameter object for the class based on the input dictionary. 168 169 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 170 as the `param` attribute of the class instance 171 :type param: dict 172 """ 173 174 self.param = param 175 176 def init_variables(self) -> None: 177 """ 178 This function initializes the variables that will be used in the rest of the class 179 """ 180 181 self.prefix = "howard" 182 self.table_variants = "variants" 183 self.dataframe = None 184 185 self.comparison_map = { 186 "gt": ">", 187 "gte": ">=", 188 "lt": "<", 189 "lte": "<=", 190 "equals": "=", 191 "contains": "SIMILAR TO", 192 } 193 194 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 195 196 self.code_type_map_to_sql = { 197 "Integer": "INTEGER", 198 "String": "VARCHAR", 199 "Float": "FLOAT", 200 "Flag": "VARCHAR", 201 } 202 203 self.index_additionnal_fields = [] 204 205 def get_indexing(self) -> bool: 206 """ 207 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 208 returns False. 209 :return: The value of the indexing parameter. 210 """ 211 212 return self.get_param().get("indexing", False) 213 214 def get_connexion_config(self) -> dict: 215 """ 216 The function `get_connexion_config` returns a dictionary containing the configuration for a 217 connection, including the number of threads and memory limit. 218 :return: a dictionary containing the configuration for the Connexion library. 219 """ 220 221 # config 222 config = self.get_config() 223 224 # Connexion config 225 connexion_config = {} 226 threads = self.get_threads() 227 228 # Threads 229 if threads: 230 connexion_config["threads"] = threads 231 232 # Memory 233 # if config.get("memory", None): 234 # connexion_config["memory_limit"] = config.get("memory") 235 if self.get_memory(): 236 connexion_config["memory_limit"] = self.get_memory() 237 238 # Temporary directory 239 if config.get("tmp", None): 240 connexion_config["temp_directory"] = config.get("tmp") 241 242 # Access 243 if config.get("access", None): 244 access = config.get("access") 245 if access in ["RO"]: 246 access = "READ_ONLY" 247 elif access in ["RW"]: 248 access = "READ_WRITE" 249 connexion_db = self.get_connexion_db() 250 if connexion_db in ":memory:": 251 access = "READ_WRITE" 252 connexion_config["access_mode"] = access 253 254 return connexion_config 255 256 def get_duckdb_settings(self) -> dict: 257 """ 258 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 259 string. 260 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 
261 """ 262 263 # config 264 config = self.get_config() 265 266 # duckdb settings 267 duckdb_settings_dict = {} 268 if config.get("duckdb_settings", None): 269 duckdb_settings = config.get("duckdb_settings") 270 duckdb_settings = full_path(duckdb_settings) 271 # duckdb setting is a file 272 if os.path.exists(duckdb_settings): 273 with open(duckdb_settings) as json_file: 274 duckdb_settings_dict = yaml.safe_load(json_file) 275 # duckdb settings is a string 276 else: 277 duckdb_settings_dict = json.loads(duckdb_settings) 278 279 return duckdb_settings_dict 280 281 def set_connexion_db(self) -> str: 282 """ 283 The function `set_connexion_db` returns the appropriate database connection string based on the 284 input format and connection type. 285 :return: the value of the variable `connexion_db`. 286 """ 287 288 # Default connexion db 289 default_connexion_db = ":memory:" 290 291 # Find connexion db 292 if self.get_input_format() in ["db", "duckdb"]: 293 connexion_db = self.get_input() 294 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 295 connexion_db = default_connexion_db 296 elif self.get_connexion_type() in ["tmpfile"]: 297 tmp_name = tempfile.mkdtemp( 298 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 299 ) 300 connexion_db = f"{tmp_name}/tmp.db" 301 elif self.get_connexion_type() != "": 302 connexion_db = self.get_connexion_type() 303 else: 304 connexion_db = default_connexion_db 305 306 # Set connexion db 307 self.connexion_db = connexion_db 308 309 return connexion_db 310 311 def set_connexion(self, conn) -> None: 312 """ 313 The function `set_connexion` creates a connection to a database, with options for different 314 database formats and settings. 315 316 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 317 database. If a connection is not provided, a new connection to an in-memory database is created. 318 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 319 sqlite 320 """ 321 322 # Connexion db 323 connexion_db = self.set_connexion_db() 324 325 # Connexion config 326 connexion_config = self.get_connexion_config() 327 328 # Connexion format 329 connexion_format = self.get_config().get("connexion_format", "duckdb") 330 # Set connexion format 331 self.connexion_format = connexion_format 332 333 # Connexion 334 if not conn: 335 if connexion_format in ["duckdb"]: 336 conn = duckdb.connect(connexion_db, config=connexion_config) 337 # duckDB settings 338 duckdb_settings = self.get_duckdb_settings() 339 if duckdb_settings: 340 for setting in duckdb_settings: 341 setting_value = duckdb_settings.get(setting) 342 if isinstance(setting_value, str): 343 setting_value = f"'{setting_value}'" 344 conn.execute(f"PRAGMA {setting}={setting_value};") 345 elif connexion_format in ["sqlite"]: 346 conn = sqlite3.connect(connexion_db) 347 348 # Set connexion 349 self.conn = conn 350 351 # Log 352 log.debug(f"connexion_format: {connexion_format}") 353 log.debug(f"connexion_db: {connexion_db}") 354 log.debug(f"connexion config: {connexion_config}") 355 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 356 357 def set_output(self, output: str = None) -> None: 358 """ 359 The `set_output` function in Python sets the output file based on the input or a specified key 360 in the config file, extracting the output name, extension, and format. 
361 362 :param output: The `output` parameter in the `set_output` method is used to specify the name of 363 the output file. If the config file has an 'output' key, the method sets the output to the value 364 of that key. If no output is provided, it sets the output to `None` 365 :type output: str 366 """ 367 368 if output and not isinstance(output, str): 369 self.output = output.name 370 else: 371 self.output = output 372 373 # Output format 374 if self.output: 375 output_name, output_extension = os.path.splitext(self.output) 376 self.output_name = output_name 377 self.output_extension = output_extension 378 self.output_format = self.output_extension.replace(".", "") 379 else: 380 self.output_name = None 381 self.output_extension = None 382 self.output_format = None 383 384 def set_header(self) -> None: 385 """ 386 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 387 """ 388 389 input_file = self.get_input() 390 default_header_list = [ 391 "##fileformat=VCFv4.2", 392 "#CHROM POS ID REF ALT QUAL FILTER INFO", 393 ] 394 395 # Full path 396 input_file = full_path(input_file) 397 398 if input_file: 399 400 input_format = self.get_input_format() 401 input_compressed = self.get_input_compressed() 402 config = self.get_config() 403 header_list = default_header_list 404 if input_format in [ 405 "vcf", 406 "hdr", 407 "tsv", 408 "csv", 409 "psv", 410 "parquet", 411 "db", 412 "duckdb", 413 ]: 414 # header provided in param 415 if config.get("header_file", None): 416 with open(config.get("header_file"), "rt") as f: 417 header_list = self.read_vcf_header(f) 418 # within a vcf file format (header within input file itsself) 419 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 420 # within a compressed vcf file format (.vcf.gz) 421 if input_compressed: 422 with bgzf.open(input_file, "rt") as f: 423 header_list = self.read_vcf_header(f) 424 # within an uncompressed vcf file format (.vcf) 425 else: 426 with open(input_file, "rt") as f: 427 header_list = self.read_vcf_header(f) 428 # header provided in default external file .hdr 429 elif os.path.exists((input_file + ".hdr")): 430 with open(input_file + ".hdr", "rt") as f: 431 header_list = self.read_vcf_header(f) 432 else: 433 try: # Try to get header info fields and file columns 434 435 with tempfile.TemporaryDirectory() as tmpdir: 436 437 # Create database 438 db_for_header = Database(database=input_file) 439 440 # Get header columns for infos fields 441 db_header_from_columns = ( 442 db_for_header.get_header_from_columns() 443 ) 444 445 # Get real columns in the file 446 db_header_columns = db_for_header.get_columns() 447 448 # Write header file 449 header_file_tmp = os.path.join(tmpdir, "header") 450 f = open(header_file_tmp, "w") 451 vcf.Writer(f, db_header_from_columns) 452 f.close() 453 454 # Replace #CHROM line with rel columns 455 header_list = db_for_header.read_header_file( 456 header_file=header_file_tmp 457 ) 458 header_list[-1] = "\t".join(db_header_columns) 459 460 except: 461 462 log.warning( 463 f"No header for file {input_file}. Set as default VCF header" 464 ) 465 header_list = default_header_list 466 467 else: # try for unknown format ? 
468 469 log.error(f"Input file format '{input_format}' not available") 470 raise ValueError(f"Input file format '{input_format}' not available") 471 472 if not header_list: 473 header_list = default_header_list 474 475 # header as list 476 self.header_list = header_list 477 478 # header as VCF object 479 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 480 481 else: 482 483 self.header_list = None 484 self.header_vcf = None 485 486 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 487 """ 488 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 489 DataFrame based on the connection format. 490 491 :param query: The `query` parameter in the `get_query_to_df` function is a string that 492 represents the SQL query you want to execute. This query will be used to fetch data from a 493 database and convert it into a pandas DataFrame 494 :type query: str 495 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 496 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 497 function will only fetch up to that number of rows from the database query result. If no limit 498 is specified, 499 :type limit: int 500 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 501 """ 502 503 # Connexion format 504 connexion_format = self.get_connexion_format() 505 506 # Limit in query 507 if limit: 508 pd.set_option("display.max_rows", limit) 509 if connexion_format in ["duckdb"]: 510 df = ( 511 self.conn.execute(query) 512 .fetch_record_batch(limit) 513 .read_next_batch() 514 .to_pandas() 515 ) 516 elif connexion_format in ["sqlite"]: 517 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 518 519 # Full query 520 else: 521 if connexion_format in ["duckdb"]: 522 df = self.conn.execute(query).df() 523 elif connexion_format in ["sqlite"]: 524 df = pd.read_sql_query(query, self.conn) 525 526 return df 527 528 def get_overview(self) -> None: 529 """ 530 The function prints the input, output, config, and dataframe of the current object 531 """ 532 table_variants_from = self.get_table_variants(clause="from") 533 sql_columns = self.get_header_columns_as_sql() 534 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 535 df = self.get_query_to_df(sql_query_export) 536 log.info( 537 "Input: " 538 + str(self.get_input()) 539 + " [" 540 + str(str(self.get_input_format())) 541 + "]" 542 ) 543 log.info( 544 "Output: " 545 + str(self.get_output()) 546 + " [" 547 + str(str(self.get_output_format())) 548 + "]" 549 ) 550 log.info("Config: ") 551 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 552 "\n" 553 ): 554 log.info("\t" + str(d)) 555 log.info("Param: ") 556 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 557 "\n" 558 ): 559 log.info("\t" + str(d)) 560 log.info("Sample list: " + str(self.get_header_sample_list())) 561 log.info("Dataframe: ") 562 for d in str(df).split("\n"): 563 log.info("\t" + str(d)) 564 565 # garbage collector 566 del df 567 gc.collect() 568 569 return None 570 571 def get_stats(self) -> dict: 572 """ 573 The `get_stats` function calculates and returns various statistics of the current object, 574 including information about the input file, variants, samples, header fields, quality, and 575 SNVs/InDels. 576 :return: a dictionary containing various statistics of the current object. 
The dictionary has 577 the following structure: 578 """ 579 580 # Log 581 log.info(f"Stats Calculation...") 582 583 # table varaints 584 table_variants_from = self.get_table_variants() 585 586 # stats dict 587 stats = {"Infos": {}} 588 589 ### File 590 input_file = self.get_input() 591 stats["Infos"]["Input file"] = input_file 592 593 # Header 594 header_infos = self.get_header().infos 595 header_formats = self.get_header().formats 596 header_infos_list = list(header_infos) 597 header_formats_list = list(header_formats) 598 599 ### Variants 600 601 stats["Variants"] = {} 602 603 # Variants by chr 604 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 605 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 606 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 607 by=["CHROM"], kind="quicksort" 608 ) 609 610 # Total number of variants 611 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 612 613 # Calculate percentage 614 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 615 lambda x: (x / nb_of_variants) 616 ) 617 618 stats["Variants"]["Number of variants by chromosome"] = ( 619 nb_of_variants_by_chrom.to_dict(orient="index") 620 ) 621 622 stats["Infos"]["Number of variants"] = int(nb_of_variants) 623 624 ### Samples 625 626 # Init 627 samples = {} 628 nb_of_samples = 0 629 630 # Check Samples 631 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 632 log.debug(f"Check samples...") 633 for sample in self.get_header_sample_list(): 634 sql_query_samples = f""" 635 SELECT '{sample}' as sample, 636 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 637 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 638 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 639 FROM {table_variants_from} 640 WHERE ( 641 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 642 AND 643 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 644 ) 645 GROUP BY genotype 646 """ 647 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 648 sample_genotype_count = sql_query_genotype_df["count"].sum() 649 if len(sql_query_genotype_df): 650 nb_of_samples += 1 651 samples[f"{sample} - {sample_genotype_count} variants"] = ( 652 sql_query_genotype_df.to_dict(orient="index") 653 ) 654 655 stats["Samples"] = samples 656 stats["Infos"]["Number of samples"] = nb_of_samples 657 658 # # 659 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 660 # stats["Infos"]["Number of samples"] = nb_of_samples 661 # elif nb_of_samples: 662 # stats["Infos"]["Number of samples"] = "not a VCF format" 663 664 ### INFO and FORMAT fields 665 header_types_df = {} 666 header_types_list = { 667 "List of INFO fields": header_infos, 668 "List of FORMAT fields": header_formats, 669 } 670 i = 0 671 for header_type in header_types_list: 672 673 header_type_infos = header_types_list.get(header_type) 674 header_infos_dict = {} 675 676 for info in header_type_infos: 677 678 i += 1 679 header_infos_dict[i] = {} 680 681 # ID 682 header_infos_dict[i]["id"] = info 683 684 # num 685 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 686 if header_type_infos[info].num in genotype_map.keys(): 687 header_infos_dict[i]["Number"] = genotype_map.get( 688 header_type_infos[info].num 689 ) 690 else: 691 header_infos_dict[i]["Number"] = 
        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map parsed Number codes back to VCF notation
                # (A: per-ALT, G: per-genotype, R: per-allele)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc is not None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
            """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE (len(REF) > 1 OR len(ALT) > 1)
                    AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

        """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
        """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
        into a JSON object, and writes the JSON object to the specified file.

        :param file: The `file` parameter is a string that represents the file path where the JSON data
        will be written
        :type file: str
        :return: the name of the file that was written to.
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to the stats file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

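    # Usage sketch for `get_stats` / `stats_to_file` (illustration only, not
    # part of the original module; file paths are hypothetical):
    #
    #   variants = Variants(input="example.vcf", load=True)
    #   stats = variants.get_stats()
    #   print(stats["Infos"]["Number of variants"])
    #   variants.stats_to_file(file="/tmp/example.stats.json")
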
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a markdown file and prints the statistics contained in a
        JSON file in a formatted manner.

        :param output_file: The `output_file` parameter is a string that specifies the path and filename
        of the output file where the stats will be printed in Markdown format. If no `output_file` is
        provided, a temporary directory will be created and the stats will be saved in a file named
        "stats.md" within that directory
        :type output_file: str
        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
        file where the statistics will be saved. If no value is provided, a temporary directory will be
        created and a default file name "stats.json" will be used
        :type json_file: str
        :return: The function `print_stats` does not return any value. It has a return type annotation
        of `None`.
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads(infos.get(info)), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

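    # Usage sketch for `print_stats` (illustration only, not part of the
    # original module; output paths are hypothetical):
    #
    #   variants = Variants(input="example.vcf", load=True)
    #   variants.print_stats(
    #       output_file="/tmp/example.stats.md",
    #       json_file="/tmp/example.stats.json",
    #   )
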
    def get_input(self) -> str:
        """
        It returns the value of the input variable.
        :return: The input is being returned.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        This function returns the format of the input file, either from the provided input file or
        from the object's input file.

        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
        represents the file path of the input file. If no `input_file` is provided when calling the
        method, it will default to `None` and the object's input file will be used
        :type input_file: str
        :return: The format of the input file is being returned.
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        The function `get_input_compressed` returns whether the input file is compressed.

        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
        that represents the file path of the input file. If no `input_file` is provided when calling the
        method, it will default to `None` and the method will then call `self.get_input()`
        :type input_file: str
        :return: The function `get_input_compressed` returns the compression status of the input file.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        It returns the output file.
        :return: The output file is being returned.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        The function `get_output_format` returns the format of the output file, or of the provided
        output file if given.

        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
        that represents the file path of the output file. If no `output_file` is provided when calling
        the method, it will default to the output obtained from the `get_output` method of the class
        instance
        :type output_file: str
        :return: The format of the output file is being returned.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config
        :return: The config variable is being returned.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param
        :return: The param variable is being returned.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object
        :return: The connexion_db is being returned.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix is being returned.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        This function returns the table_variants attribute of the object

        :param clause: the type of clause the table will be used in. Either "select" or "from",
        defaults to "select" (optional)
        :return: The table_variants attribute of the object.
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on configuration
        parameters or a default path.
        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
        configuration, parameters, and a default value of "/tmp".
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        It returns the connexion type from the config, defaulting to "memory".

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        It returns the connection object

        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file, as a vcf.Reader object or as a list of
        strings

        :param type: the type of header you want to get, either "vcf" or "list", defaults to vcf
        (optional)
        :return: The header of the VCF file.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        This function retrieves a list of information fields from the header.
        :return: A list of information fields from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the columns line (#CHROM line) of the VCF header

        :return: The columns line of the header.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

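    # Usage sketch for the header accessors (illustration only, not part of
    # the original module; field names are hypothetical):
    #
    #   variants = Variants(input="example.vcf", load=True)
    #   print(variants.get_header_length())      # header lines, without #CHROM line
    #   print(variants.get_header_columns())     # the "#CHROM\tPOS\t..." line
    #   print(variants.get_header_infos_list())  # e.g. ['AF', 'DP', ...]
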
    def get_header_columns_as_list(self) -> list:
        """
        This function returns the columns of the VCF header as a list

        :return: The columns of the header, as a list.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as a comma-separated string of quoted column names,
        for use in SQL queries

        :return: The header columns as a SQL string.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(
        self, check: bool = False, samples: list = None, samples_force: bool = False
    ) -> list:
        """
        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
        checking and filtering based on input parameters.

        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
        parameter that determines whether to check if the samples in the list are properly defined as
        genotype columns. If `check` is set to `True`, the function will verify that each sample in the
        list is defined as a genotype column, defaults to False
        :type check: bool (optional)
        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
        allows you to specify a subset of samples from the header. If you provide a list of sample
        names, the function will check if each sample is defined in the header. If a sample is not found
        in the header, it is skipped with a warning
        :type samples: list
        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
        a boolean parameter that determines whether to force the function to return the sample list
        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
        function will return the sample list without performing the check, defaults to False
        :type samples_force: bool (optional)
        :return: The function `get_header_sample_list` returns a list of samples based on the input
        parameters and conditions specified in the function.
        """

        # Init
        samples_list = []

        if samples is None:
            samples_list = self.header_vcf.samples
        else:
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        if check:
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list

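    # Usage sketch for `get_header_sample_list` (illustration only, not part
    # of the original module; the sample name is hypothetical):
    #
    #   all_samples = variants.get_header_sample_list()
    #   checked = variants.get_header_sample_list(check=True, samples=["sample1"])
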
    def is_genotype_column(self, column: str = None) -> bool:
        """
        This function checks if a given column is a genotype column in a database.

        :param column: The `column` parameter in the `is_genotype_column` method is a string that
        represents the column name in a database table. This method checks if the specified column is a
        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
        method of the `Database` class
        :type column: str
        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
        column name and returns the result. If the `column` parameter is None, it returns False.
        """

        if column is not None:
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False

    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
        exist

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        It returns the connexion format of the object, raising a ValueError if the format is not
        supported.
        :return: The connexion_format is being returned.
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system, or an open file object
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string, e.g. '"#CHROM", "POS"'
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if needed, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. This means that the file will be read in chunks of
        `chunksize` rows, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves the local DataFrame `chunk` by name (replacement scan)
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

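    # Usage sketch for `insert_file_to_table` (illustration only, not part of
    # the original module; assumes a 'variants' table already created and a
    # tab-separated file with one header line):
    #
    #   variants.insert_file_to_table(
    #       "example.tsv",
    #       columns='"#CHROM", "POS", "ID", "REF", "ALT"',
    #       header_len=1,
    #       sep="\t",
    #   )
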
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file. If it is not set, -1 (no sampling limit) will be used, defaults to 20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples (copy, to keep the main structure unchanged)
            structure_complete = dict(structure)
            if self.get_header_sample_list():
                structure_complete["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the length of each file chunk to load
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

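    # Usage sketch for `load_data` (illustration only, not part of the
    # original module):
    #
    #   variants = Variants(input="example.vcf.gz", output="example.parquet")
    #   variants.load_data(drop_variants_table=True)
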
    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
        to False if it is not set.
        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
        value. If the parameter is not present, it will return False.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of exploded information fields based on
        the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
        fields to be exploded. It can be set to "*" to explode all fields, or it can be a
        comma-separated list of field names (or patterns) to explode
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
        If no fields can be determined, it returns an empty list. Otherwise, it returns a list of field
        names, after stripping spaces, splitting the input string by commas, and resolving patterns
        (such as "*") against the fields available in the header.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(rf"^{field}$")
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

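    # Usage sketch for `get_explode_infos_fields` (illustration only, not part
    # of the original module; field names are hypothetical):
    #
    #   variants.get_explode_infos_fields("DP,AF")    # -> ["DP", "AF"]
    #   variants.get_explode_infos_fields("*")        # -> all fields in header
    #   variants.get_explode_infos_fields("dbSNP.*")  # -> fields matching the pattern
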
    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix`
        parameter, or the value found in the param dictionary if `explode_infos_prefix` is not
        provided.

        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
        prefix to be used for exploded INFO fields
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column. If a default value is provided, it will be assigned to
        the column for any existing rows that do not have a value for that column
        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
        if it already exists in the table. If `drop` is set to `True`, the function will drop the
        existing column before adding the new column. If `drop` is set to `False` (default), the
        existing column is kept and nothing is added, defaults to False
        :type drop: bool (optional)
        :return: a dictionary describing the added column (table name, column name, column type and
        default value), or None if no column was added.
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column does NOT exist in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

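    # Usage sketch for `add_column` (illustration only, not part of the
    # original module; the column name is hypothetical):
    #
    #   added = variants.add_column(
    #       table_name="variants",
    #       column_name="INFO/DP",
    #       column_type="INTEGER",
    #       default_value="null",
    #   )
    #   # -> {"table_name": "variants", "column_name": "INFO/DP", ...} or None
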
    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        The `drop_column` function drops a specified column from a given table in a database and returns
        True if the column was successfully dropped, and False if the column does not exist in the
        table.

        :param column: The `column` parameter is either a dictionary that contains information about the
        column you want to drop, with two keys ("table_name" and "column_name"), or a string with the
        name of a column in the variants table
        :type column: dict
        :param table_name: The `table_name` parameter is the name of the table from which you want to
        drop a column
        :type table_name: str
        :param column_name: The `column_name` parameter is the name of the column that you want to drop
        from the table
        :type column_name: str
        :return: a boolean value. It returns True if the column was successfully dropped from the table,
        and False if the column does not exist in the table.
        """

        # Find column infos
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        if not table_name and not column_name:
            return False

        # Removed
        removed = False

        # Check if the column already exists in the table
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column does NOT exist in the {table_name} table")
            return False

        # Drop column in table (e.g. ALTER TABLE integers DROP k)
        drop_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(drop_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped from the {table_name} table"
        )

        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
        a list to the `fields` parameter
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False`, the
        existing column is kept, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name. If the `table`
        parameter is not provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the table after insertion
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ("REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

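    # Usage sketch for `explode_infos` (illustration only, not part of the
    # original module; field names are hypothetical):
    #
    #   added = variants.explode_infos(prefix="INFO/", fields=["DP", "AF"])
    #   # creates columns "INFO/DP" and "INFO/AF" filled from the INFO column
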
    def drop_indexes(self) -> None:
        """
        Drop the indexes on the variants table
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines

        :param f: the file object
        :return: The header lines of the VCF file.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
        uncompressed files.

        :param file: The `file` parameter is a string that represents the path to the VCF header file
        that you want to read. It is an optional parameter, so if you don't provide a value, it will
        default to `None`
        :type file: str
        :return: The function `read_vcf_header_file` returns a list.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
        """
        if query:
            return self.conn.execute(query)  # .fetchall()
        else:
            return None

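    # Usage sketch for `read_vcf_header_file` (illustration only, not part of
    # the original module):
    #
    #   header_lines = variants.read_vcf_header_file(file="example.vcf.gz")
    #   # header_lines ends with the "#CHROM\tPOS\t..." columns line
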
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
        fields_to_rename: dict | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to various formats, including VCF,
        CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
        partitioning.

        :param output_file: The `output_file` parameter is a string that specifies the name of the
        output file where the exported data will be saved
        :type output_file: str | None
        :param output_header: The `output_header` parameter is a string that specifies the name of the
        file where the header of the VCF file will be exported. If this parameter is not provided, the
        header will be exported to a file with the same name as the `output_file` parameter, but with
        the extension ".hdr"
        :type output_header: str | None
        :param export_header: The `export_header` parameter is a boolean flag that determines whether
        the header of a VCF file should be exported to a separate file or not. If `export_header` is
        True, the header will be exported to a file. If `export_header` is False, the header will not
        be exported, defaults to True
        :type export_header: bool (optional)
        :param query: The `query` parameter in the `export_output` function is an optional SQL query
        that can be used to filter and select specific data from the VCF file before exporting it. If
        provided, only the data that matches the query will be exported. This allows you to customize
        the exported data
        :type query: str | None
        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
        organize data in a hierarchical directory structure based on the values of one or more columns.
        This can improve query performance when working with large datasets
        :type parquet_partitions: list | None
        :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when
        exporting data in Parquet format. This parameter is used for partitioning the Parquet file into
        multiple files. It helps in optimizing the export process by breaking down the data into
        manageable chunks for processing and storage
        :type chunk_size: int | None
        :param threads: The `threads` parameter in the `export_output` function specifies the number of
        threads to be used during the export process. It determines the level of parallelism and can
        improve the performance of the export operation. If this parameter is not provided, the function
        will use the default number of threads
        :type threads: int | None
        :param sort: The `sort` parameter in the `export_output` function is a boolean flag that
        determines whether the output file should be sorted based on genomic coordinates of the
        variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to
        `False`, it will not be sorted, defaults to False
        :type sort: bool (optional)
        :param index: The `index` parameter in the `export_output` function is a boolean flag that
        determines whether an index should be created on the output file. If `index` is set to `True`,
        an index will be created on the output file. If `index` is set to `False`, no index will be
        created, defaults to False
        :type index: bool (optional)
        :param order_by: The `order_by` parameter in the `export_output` function is a string that
        specifies the column(s) to use for sorting the output file. This parameter is only applicable
        when exporting data in VCF format. It allows you to specify the column(s) based on which the
        output file should be ordered
        :type order_by: str | None
        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the
        mapping of field names to be renamed during the export process. This parameter allows you to
        customize the output field names before exporting the data. Each key-value pair in the
        dictionary represents the original field name as the key and the new field name as the value
        :type fields_to_rename: dict | None
        :return: The `export_output` function returns a boolean value. It checks if the output file
        exists and returns True if it does, or None if it doesn't.
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If no threads, get them
        if not threads:
            threads = self.get_threads()

        # Rename fields
        if not fields_to_rename:
            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
        self.rename_info_fields(fields_to_rename=fields_to_rename)

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # Export in Parquet
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
            """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing columns header
        existing_columns_header = database.get_header_columns_from_database(query=query)

        # Sample list
        if output_file_type in ["vcf"]:
            get_samples = self.get_samples()
            get_samples_check = self.get_samples_check()
            samples_force = get_samples is not None
            sample_list = self.get_header_sample_list(
                check=get_samples_check,
                samples=get_samples,
                samples_force=samples_force,
            )
        else:
            sample_list = None

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove tmp files
        remove_if_exists(tmp_to_remove)

        return os.path.exists(output_file) or None

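    # Usage sketch for `export_output` (illustration only, not part of the
    # original module):
    #
    #   variants = Variants(input="example.vcf", output="example.parquet", load=True)
    #   variants.export_output(threads=2, sort=False)
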
This extension will be appended to the `output_file` name to 2374 create the final, defaults to .hdr 2375 :type output_file_ext: str (optional) 2376 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2377 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2378 `True`, the function will clean the header by modifying certain lines based on a specific 2379 pattern. If `clean_header`, defaults to True 2380 :type clean_header: bool (optional) 2381 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2382 boolean flag that determines whether the #CHROM line should be removed from the header before 2383 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2384 defaults to False 2385 :type remove_chrom_line: bool (optional) 2386 :return: The function `export_header` returns the name of the temporary header file that is 2387 created. 2388 """ 2389 2390 if not header_name and not output_file: 2391 output_file = self.get_output() 2392 2393 if self.get_header(): 2394 2395 # Get header object 2396 header_obj = self.get_header() 2397 2398 # Create database 2399 db_for_header = Database(database=self.get_input()) 2400 2401 # Get real columns in the file 2402 db_header_columns = db_for_header.get_columns() 2403 2404 with tempfile.TemporaryDirectory() as tmpdir: 2405 2406 # Write header file 2407 header_file_tmp = os.path.join(tmpdir, "header") 2408 f = open(header_file_tmp, "w") 2409 vcf.Writer(f, header_obj) 2410 f.close() 2411 2412 # Replace #CHROM line with rel columns 2413 header_list = db_for_header.read_header_file( 2414 header_file=header_file_tmp 2415 ) 2416 header_list[-1] = "\t".join(db_header_columns) 2417 2418 # Remove CHROM line 2419 if remove_chrom_line: 2420 header_list.pop() 2421 2422 # Clean header 2423 if clean_header: 2424 header_list_clean = [] 2425 for head in header_list: 2426 # Clean head for malformed header 2427 head_clean = head 2428 head_clean = re.subn( 2429 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2430 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2431 head_clean, 2432 2, 2433 )[0] 2434 # Write header 2435 header_list_clean.append(head_clean) 2436 header_list = header_list_clean 2437 2438 tmp_header_name = output_file + output_file_ext 2439 2440 f = open(tmp_header_name, "w") 2441 for line in header_list: 2442 f.write(line) 2443 f.close() 2444 2445 return tmp_header_name 2446 2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. 
If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
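            # Note (explanatory comment): when remove_info is truthy, the INFO
            # column is replaced in the SELECT below by a constant placeholder,
            # '.' by default, or the given string if remove_info is itself a str.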
2509 info_field = f"""'{remove_info}' as INFO""" 2510 else: 2511 info_field = "INFO" 2512 2513 # Samples fields 2514 if add_samples: 2515 if not list_samples: 2516 list_samples = self.get_header_sample_list() 2517 if list_samples: 2518 samples_fields = " , FORMAT , " + " , ".join( 2519 [f""" "{sample}" """ for sample in list_samples] 2520 ) 2521 else: 2522 samples_fields = "" 2523 log.debug(f"samples_fields: {samples_fields}") 2524 else: 2525 samples_fields = "" 2526 2527 # Where clause 2528 if where_clause is None: 2529 where_clause = "" 2530 2531 # Variants 2532 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2533 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2534 log.debug(f"sql_query_select={sql_query_select}") 2535 2536 return self.export_output( 2537 output_file=vcf_file, 2538 output_header=None, 2539 export_header=True, 2540 query=sql_query_select, 2541 parquet_partitions=None, 2542 chunk_size=config.get("chunk_size", None), 2543 threads=threads, 2544 sort=True, 2545 index=index, 2546 order_by=None, 2547 ) 2548 2549 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2550 """ 2551 It takes a list of commands and runs them in parallel using the number of threads specified 2552 2553 :param commands: A list of commands to run 2554 :param threads: The number of threads to use, defaults to 1 (optional) 2555 """ 2556 2557 run_parallel_commands(commands, threads) 2558 2559 def get_threads(self, default: int = 1) -> int: 2560 """ 2561 This function returns the number of threads to use for a job, with a default value of 1 if not 2562 specified. 2563 2564 :param default: The `default` parameter in the `get_threads` method is used to specify the 2565 default number of threads to use if no specific value is provided. If no value is provided for 2566 the `threads` parameter in the configuration or input parameters, the `default` value will be 2567 used, defaults to 1 2568 :type default: int (optional) 2569 :return: the number of threads to use for the current job. 2570 """ 2571 2572 # Config 2573 config = self.get_config() 2574 2575 # Param 2576 param = self.get_param() 2577 2578 # Input threads 2579 input_thread = param.get("threads", config.get("threads", None)) 2580 2581 # Check threads 2582 if not input_thread: 2583 threads = default 2584 elif int(input_thread) <= 0: 2585 threads = os.cpu_count() 2586 else: 2587 threads = int(input_thread) 2588 return threads 2589 2590 def get_memory(self, default: str = None) -> str: 2591 """ 2592 This function retrieves the memory value from parameters or configuration with a default value 2593 if not found. 2594 2595 :param default: The `get_memory` function takes in a default value as a string parameter. This 2596 default value is used as a fallback in case the `memory` parameter is not provided in the 2597 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2598 the function 2599 :type default: str 2600 :return: The `get_memory` function returns a string value representing the memory parameter. If 2601 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2602 return the default value provided as an argument to the function. 
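
        Example (a hypothetical sketch, assuming a configured `variants` instance):

            >>> variants.get_memory(default="8G")  # "8G" unless "memory" is set in param/config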
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory
        input_memory = param.get("memory", config.get("memory", None))

        # Check memory
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        > If the connexion is duckdb, use the DuckDB method, otherwise use the SQLite method

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Loading VCF into temporary table
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        sql_query_update = f"""
            UPDATE {table_variants} as table_variants
            SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
            ;
            """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Loading VCF into temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO =  CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
2732 CASE 2733 WHEN table_vcf.INFO NOT IN ('','.') 2734 THEN table_vcf.INFO 2735 ELSE '' 2736 END 2737 FROM {table_vcf} as table_vcf 2738 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2739 AND table_vcf.\"POS\" = table_variants.\"POS\" 2740 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2741 AND table_vcf.\"REF\" = table_variants.\"REF\" 2742 ) 2743 """ 2744 self.conn.execute(sql_query_update) 2745 2746 # Drop temporary table 2747 sql_drop = f"DROP TABLE {table_vcf}" 2748 self.conn.execute(sql_drop) 2749 2750 def drop_variants_table(self) -> None: 2751 """ 2752 > This function drops the variants table 2753 """ 2754 2755 table_variants = self.get_table_variants() 2756 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2757 self.conn.execute(sql_table_variants) 2758 2759 def set_variant_id( 2760 self, variant_id_column: str = "variant_id", force: bool = None 2761 ) -> str: 2762 """ 2763 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2764 `#CHROM`, `POS`, `REF`, and `ALT` columns 2765 2766 :param variant_id_column: The name of the column to be created in the variants table, defaults 2767 to variant_id 2768 :type variant_id_column: str (optional) 2769 :param force: If True, the variant_id column will be created even if it already exists 2770 :type force: bool 2771 :return: The name of the column that contains the variant_id 2772 """ 2773 2774 # Assembly 2775 assembly = self.get_param().get( 2776 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2777 ) 2778 2779 # INFO/Tag prefix 2780 prefix = self.get_explode_infos_prefix() 2781 2782 # Explode INFO/SVTYPE 2783 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2784 2785 # variants table 2786 table_variants = self.get_table_variants() 2787 2788 # variant_id column 2789 if not variant_id_column: 2790 variant_id_column = "variant_id" 2791 2792 # Creta variant_id column 2793 if "variant_id" not in self.get_extra_infos() or force: 2794 2795 # Create column 2796 self.add_column( 2797 table_name=table_variants, 2798 column_name=variant_id_column, 2799 column_type="UBIGINT", 2800 default_value="0", 2801 ) 2802 2803 # Update column 2804 self.conn.execute( 2805 f""" 2806 UPDATE {table_variants} 2807 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2808 """ 2809 ) 2810 2811 # Remove added columns 2812 for added_column in added_columns: 2813 self.drop_column(column=added_column) 2814 2815 # return variant_id column name 2816 return variant_id_column 2817 2818 def get_variant_id_column( 2819 self, variant_id_column: str = "variant_id", force: bool = None 2820 ) -> str: 2821 """ 2822 This function returns the variant_id column name 2823 2824 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2825 defaults to variant_id 2826 :type variant_id_column: str (optional) 2827 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2828 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2829 if it is not already set, or if it is set 2830 :type force: bool 2831 :return: The variant_id column name. 
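
        Example (a hypothetical usage sketch):

            >>> variants.get_variant_id_column()  # creates the column if needed
            'variant_id'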
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        The function `scan_databases` scans for available databases based on specified formats and
        releases.

        :param database_formats: The `database_formats` parameter is a list that specifies the formats
        of the databases to be scanned, defaults to ["parquet"]
        :type database_formats: list (optional)
        :param database_releases: The `database_releases` parameter is a list that specifies the
        releases of the databases to be scanned. The default value is `["current"]`, meaning that by
        default, the function will scan databases in the "current" release
        :type database_releases: list (optional)
        :return: The function `scan_databases` returns a dictionary containing information about
        databases that match the specified formats and releases.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
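
        Example (a minimal, hypothetical parameter sketch; file names are placeholders):

            param = {
                "assembly": "hg19",
                "annotations": "my_database.parquet,snpeff:-hgvs",
            }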
2892 """ 2893 2894 # Config 2895 config = self.get_config() 2896 2897 # Param 2898 param = self.get_param() 2899 2900 # Param - Assembly 2901 assembly = param.get("assembly", config.get("assembly", None)) 2902 if not assembly: 2903 assembly = DEFAULT_ASSEMBLY 2904 log.warning(f"Default assembly '{assembly}'") 2905 2906 # annotations databases folders 2907 annotations_databases = set( 2908 config.get("folders", {}) 2909 .get("databases", {}) 2910 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2911 + config.get("folders", {}) 2912 .get("databases", {}) 2913 .get("parquet", ["~/howard/databases/parquet/current"]) 2914 + config.get("folders", {}) 2915 .get("databases", {}) 2916 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2917 ) 2918 2919 # Get param annotations 2920 if param.get("annotations", None) and isinstance( 2921 param.get("annotations", None), str 2922 ): 2923 log.debug(param.get("annotations", None)) 2924 param_annotation_list = param.get("annotations").split(",") 2925 else: 2926 param_annotation_list = [] 2927 2928 # Each tools param 2929 if param.get("annotation_parquet", None) != None: 2930 log.debug( 2931 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2932 ) 2933 if isinstance(param.get("annotation_parquet", None), list): 2934 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2935 else: 2936 param_annotation_list.append(param.get("annotation_parquet")) 2937 if param.get("annotation_snpsift", None) != None: 2938 if isinstance(param.get("annotation_snpsift", None), list): 2939 param_annotation_list.append( 2940 "snpsift:" 2941 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2942 ) 2943 else: 2944 param_annotation_list.append( 2945 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2946 ) 2947 if param.get("annotation_snpeff", None) != None: 2948 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2949 if param.get("annotation_bcftools", None) != None: 2950 if isinstance(param.get("annotation_bcftools", None), list): 2951 param_annotation_list.append( 2952 "bcftools:" 2953 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2954 ) 2955 else: 2956 param_annotation_list.append( 2957 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2958 ) 2959 if param.get("annotation_annovar", None) != None: 2960 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2961 if param.get("annotation_exomiser", None) != None: 2962 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2963 if param.get("annotation_splice", None) != None: 2964 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2965 2966 # Merge param annotations list 2967 param["annotations"] = ",".join(param_annotation_list) 2968 2969 # debug 2970 log.debug(f"param_annotations={param['annotations']}") 2971 2972 if param.get("annotations"): 2973 2974 # Log 2975 # log.info("Annotations - Check annotation parameters") 2976 2977 if not "annotation" in param: 2978 param["annotation"] = {} 2979 2980 # List of annotations parameters 2981 annotations_list_input = {} 2982 if isinstance(param.get("annotations", None), str): 2983 annotation_file_list = [ 2984 value for value in param.get("annotations", "").split(",") 2985 ] 2986 for annotation_file in annotation_file_list: 2987 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2988 else: 2989 annotations_list_input = param.get("annotations", {}) 2990 2991 log.info(f"Quick 
Annotations:") 2992 for annotation_key in list(annotations_list_input.keys()): 2993 log.info(f" {annotation_key}") 2994 2995 # List of annotations and associated fields 2996 annotations_list = {} 2997 2998 for annotation_file in annotations_list_input: 2999 3000 # Explode annotations if ALL 3001 if ( 3002 annotation_file.upper() == "ALL" 3003 or annotation_file.upper().startswith("ALL:") 3004 ): 3005 3006 # check ALL parameters (formats, releases) 3007 annotation_file_split = annotation_file.split(":") 3008 database_formats = "parquet" 3009 database_releases = "current" 3010 for annotation_file_option in annotation_file_split[1:]: 3011 database_all_options_split = annotation_file_option.split("=") 3012 if database_all_options_split[0] == "format": 3013 database_formats = database_all_options_split[1].split("+") 3014 if database_all_options_split[0] == "release": 3015 database_releases = database_all_options_split[1].split("+") 3016 3017 # Scan for availabled databases 3018 databases_infos_dict = self.scan_databases( 3019 database_formats=database_formats, 3020 database_releases=database_releases, 3021 ) 3022 3023 # Add found databases in annotation parameters 3024 for database_infos in databases_infos_dict.keys(): 3025 annotations_list[database_infos] = {"INFO": None} 3026 3027 else: 3028 annotations_list[annotation_file] = annotations_list_input[ 3029 annotation_file 3030 ] 3031 3032 # Check each databases 3033 if len(annotations_list): 3034 3035 log.info( 3036 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 3037 ) 3038 3039 for annotation_file in annotations_list: 3040 3041 # Init 3042 annotations = annotations_list.get(annotation_file, None) 3043 3044 # Annotation snpEff 3045 if annotation_file.startswith("snpeff"): 3046 3047 log.debug(f"Quick Annotation snpEff") 3048 3049 if "snpeff" not in param["annotation"]: 3050 param["annotation"]["snpeff"] = {} 3051 3052 if "options" not in param["annotation"]["snpeff"]: 3053 param["annotation"]["snpeff"]["options"] = "" 3054 3055 # snpEff options in annotations 3056 param["annotation"]["snpeff"]["options"] = "".join( 3057 annotation_file.split(":")[1:] 3058 ) 3059 3060 # Annotation Annovar 3061 elif annotation_file.startswith("annovar"): 3062 3063 log.debug(f"Quick Annotation Annovar") 3064 3065 if "annovar" not in param["annotation"]: 3066 param["annotation"]["annovar"] = {} 3067 3068 if "annotations" not in param["annotation"]["annovar"]: 3069 param["annotation"]["annovar"]["annotations"] = {} 3070 3071 # Options 3072 annotation_file_split = annotation_file.split(":") 3073 for annotation_file_annotation in annotation_file_split[1:]: 3074 if annotation_file_annotation: 3075 param["annotation"]["annovar"]["annotations"][ 3076 annotation_file_annotation 3077 ] = annotations 3078 3079 # Annotation Exomiser 3080 elif annotation_file.startswith("exomiser"): 3081 3082 log.debug(f"Quick Annotation Exomiser") 3083 3084 param["annotation"]["exomiser"] = params_string_to_dict( 3085 annotation_file 3086 ) 3087 3088 # Annotation Splice 3089 elif annotation_file.startswith("splice"): 3090 3091 log.debug(f"Quick Annotation Splice") 3092 3093 param["annotation"]["splice"] = params_string_to_dict( 3094 annotation_file 3095 ) 3096 3097 # Annotation Parquet or BCFTOOLS 3098 else: 3099 3100 # Tools detection 3101 if annotation_file.startswith("bcftools:"): 3102 annotation_tool_initial = "bcftools" 3103 annotation_file = ":".join(annotation_file.split(":")[1:]) 3104 elif annotation_file.startswith("snpsift:"): 3105 
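                        # Tool prefixes ("bcftools:", "snpsift:", "bigwig:") route the
                        # annotation string to a specific engine; the prefix is stripped
                        # and the remainder (e.g. "snpsift:db1.vcf.gz+db2.vcf.gz", a
                        # hypothetical example) is parsed as a '+'/':'-separated file list.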
annotation_tool_initial = "snpsift" 3106 annotation_file = ":".join(annotation_file.split(":")[1:]) 3107 elif annotation_file.startswith("bigwig:"): 3108 annotation_tool_initial = "bigwig" 3109 annotation_file = ":".join(annotation_file.split(":")[1:]) 3110 else: 3111 annotation_tool_initial = None 3112 3113 # list of files 3114 annotation_file_list = annotation_file.replace("+", ":").split( 3115 ":" 3116 ) 3117 3118 for annotation_file in annotation_file_list: 3119 3120 if annotation_file: 3121 3122 # Annotation tool initial 3123 annotation_tool = annotation_tool_initial 3124 3125 # Find file 3126 annotation_file_found = None 3127 3128 if os.path.exists(annotation_file): 3129 annotation_file_found = annotation_file 3130 elif os.path.exists(full_path(annotation_file)): 3131 annotation_file_found = full_path(annotation_file) 3132 else: 3133 # Find within assembly folders 3134 for annotations_database in annotations_databases: 3135 found_files = find_all( 3136 annotation_file, 3137 os.path.join( 3138 annotations_database, assembly 3139 ), 3140 ) 3141 if len(found_files) > 0: 3142 annotation_file_found = found_files[0] 3143 break 3144 if not annotation_file_found and not assembly: 3145 # Find within folders 3146 for ( 3147 annotations_database 3148 ) in annotations_databases: 3149 found_files = find_all( 3150 annotation_file, annotations_database 3151 ) 3152 if len(found_files) > 0: 3153 annotation_file_found = found_files[0] 3154 break 3155 log.debug( 3156 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3157 ) 3158 3159 # Full path 3160 annotation_file_found = full_path(annotation_file_found) 3161 3162 if annotation_file_found: 3163 3164 database = Database(database=annotation_file_found) 3165 quick_annotation_format = database.get_format() 3166 quick_annotation_is_compressed = ( 3167 database.is_compressed() 3168 ) 3169 quick_annotation_is_indexed = os.path.exists( 3170 f"{annotation_file_found}.tbi" 3171 ) 3172 bcftools_preference = False 3173 3174 # Check Annotation Tool 3175 if not annotation_tool: 3176 if ( 3177 bcftools_preference 3178 and quick_annotation_format 3179 in ["vcf", "bed"] 3180 and quick_annotation_is_compressed 3181 and quick_annotation_is_indexed 3182 ): 3183 annotation_tool = "bcftools" 3184 elif quick_annotation_format in [ 3185 "vcf", 3186 "bed", 3187 "tsv", 3188 "tsv", 3189 "csv", 3190 "json", 3191 "tbl", 3192 "parquet", 3193 "duckdb", 3194 ]: 3195 annotation_tool = "parquet" 3196 elif quick_annotation_format in ["bw"]: 3197 annotation_tool = "bigwig" 3198 else: 3199 log.error( 3200 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3201 ) 3202 raise ValueError( 3203 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3204 ) 3205 3206 log.debug( 3207 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3208 ) 3209 3210 # Annotation Tool dispatch 3211 if annotation_tool: 3212 if annotation_tool not in param["annotation"]: 3213 param["annotation"][annotation_tool] = {} 3214 if ( 3215 "annotations" 3216 not in param["annotation"][annotation_tool] 3217 ): 3218 param["annotation"][annotation_tool][ 3219 "annotations" 3220 ] = {} 3221 param["annotation"][annotation_tool][ 3222 "annotations" 3223 ][annotation_file_found] = annotations 3224 3225 else: 3226 log.warning( 3227 f"Quick Annotation File {annotation_file} does NOT exist" 3228 ) 3229 3230 self.set_param(param) 3231 3232 if param.get("annotation", None): 3233 
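            # Dispatch: each configured engine runs in turn (parquet, bcftools,
            # snpsift, bigwig, annovar, snpeff, exomiser, splice), each reading
            # the parameters prepared above.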
log.info("Annotations") 3234 if param.get("annotation", {}).get("parquet", None): 3235 log.info("Annotations 'parquet'...") 3236 self.annotation_parquet() 3237 if param.get("annotation", {}).get("bcftools", None): 3238 log.info("Annotations 'bcftools'...") 3239 self.annotation_bcftools() 3240 if param.get("annotation", {}).get("snpsift", None): 3241 log.info("Annotations 'snpsift'...") 3242 self.annotation_snpsift() 3243 if param.get("annotation", {}).get("bigwig", None): 3244 log.info("Annotations 'bigwig'...") 3245 self.annotation_bigwig() 3246 if param.get("annotation", {}).get("annovar", None): 3247 log.info("Annotations 'annovar'...") 3248 self.annotation_annovar() 3249 if param.get("annotation", {}).get("snpeff", None): 3250 log.info("Annotations 'snpeff'...") 3251 self.annotation_snpeff() 3252 if param.get("annotation", {}).get("exomiser", None) is not None: 3253 log.info("Annotations 'exomiser'...") 3254 self.annotation_exomiser() 3255 if param.get("annotation", {}).get("splice", None) is not None: 3256 log.info("Annotations 'splice' ...") 3257 self.annotation_splice() 3258 3259 # Explode INFOS fields into table fields 3260 if self.get_explode_infos(): 3261 self.explode_infos( 3262 prefix=self.get_explode_infos_prefix(), 3263 fields=self.get_explode_infos_fields(), 3264 force=True, 3265 ) 3266 3267 def annotation_bigwig(self, threads: int = None) -> None: 3268 """ 3269 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3270 3271 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3272 number of threads to be used for parallel processing during the annotation process. If the 3273 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3274 threads to use based on the system configuration 3275 :type threads: int 3276 :return: True 3277 """ 3278 3279 # DEBUG 3280 log.debug("Start annotation with bigwig databases") 3281 3282 # # Threads 3283 # if not threads: 3284 # threads = self.get_threads() 3285 # log.debug("Threads: " + str(threads)) 3286 3287 # Config 3288 config = self.get_config() 3289 log.debug("Config: " + str(config)) 3290 3291 # Config - BCFTools databases folders 3292 databases_folders = set( 3293 self.get_config() 3294 .get("folders", {}) 3295 .get("databases", {}) 3296 .get("annotations", ["."]) 3297 + self.get_config() 3298 .get("folders", {}) 3299 .get("databases", {}) 3300 .get("bigwig", ["."]) 3301 ) 3302 log.debug("Databases annotations: " + str(databases_folders)) 3303 3304 # Param 3305 annotations = ( 3306 self.get_param() 3307 .get("annotation", {}) 3308 .get("bigwig", {}) 3309 .get("annotations", None) 3310 ) 3311 log.debug("Annotations: " + str(annotations)) 3312 3313 # Assembly 3314 assembly = self.get_param().get( 3315 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3316 ) 3317 3318 # Data 3319 table_variants = self.get_table_variants() 3320 3321 # Check if not empty 3322 log.debug("Check if not empty") 3323 sql_query_chromosomes = ( 3324 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3325 ) 3326 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3327 if not sql_query_chromosomes_df["count"][0]: 3328 log.info(f"VCF empty") 3329 return 3330 3331 # VCF header 3332 vcf_reader = self.get_header() 3333 log.debug("Initial header: " + str(vcf_reader.infos)) 3334 3335 # Existing annotations 3336 for vcf_annotation in self.get_header().infos: 3337 3338 vcf_annotation_line = 
self.get_header().infos.get(vcf_annotation) 3339 log.debug( 3340 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3341 ) 3342 3343 if annotations: 3344 3345 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3346 3347 # Export VCF file 3348 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3349 3350 # annotation_bigwig_config 3351 annotation_bigwig_config_list = [] 3352 3353 for annotation in annotations: 3354 annotation_fields = annotations[annotation] 3355 3356 # Annotation Name 3357 annotation_name = os.path.basename(annotation) 3358 3359 if not annotation_fields: 3360 annotation_fields = {"INFO": None} 3361 3362 log.debug(f"Annotation '{annotation_name}'") 3363 log.debug( 3364 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3365 ) 3366 3367 # Create Database 3368 database = Database( 3369 database=annotation, 3370 databases_folders=databases_folders, 3371 assembly=assembly, 3372 ) 3373 3374 # Find files 3375 db_file = database.get_database() 3376 db_file = full_path(db_file) 3377 db_hdr_file = database.get_header_file() 3378 db_hdr_file = full_path(db_hdr_file) 3379 db_file_type = database.get_format() 3380 3381 # If db_file is http ? 3382 if database.get_database().startswith("http"): 3383 3384 # Datbase is HTTP URL 3385 db_file_is_http = True 3386 3387 # DB file keep as URL 3388 db_file = database.get_database() 3389 log.warning( 3390 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3391 ) 3392 3393 # Retrieve automatic annotation field name 3394 annotation_field = clean_annotation_field( 3395 os.path.basename(db_file).replace(".bw", "") 3396 ) 3397 log.debug( 3398 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3399 ) 3400 3401 # Create automatic header file 3402 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3403 with open(db_hdr_file, "w") as f: 3404 f.write("##fileformat=VCFv4.2\n") 3405 f.write( 3406 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3407 ) 3408 f.write(f"#CHROM START END {annotation_field}\n") 3409 3410 else: 3411 3412 # Datbase is NOT HTTP URL 3413 db_file_is_http = False 3414 3415 # Check index - try to create if not exists 3416 if ( 3417 db_file is None 3418 or db_hdr_file is None 3419 or (not os.path.exists(db_file) and not db_file_is_http) 3420 or not os.path.exists(db_hdr_file) 3421 or not db_file_type in ["bw"] 3422 ): 3423 # if False: 3424 log.error("Annotation failed: database not valid") 3425 log.error(f"Annotation annotation file: {db_file}") 3426 log.error(f"Annotation annotation file type: {db_file_type}") 3427 log.error(f"Annotation annotation header: {db_hdr_file}") 3428 raise ValueError( 3429 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3430 ) 3431 else: 3432 3433 # Log 3434 log.debug( 3435 f"Annotation '{annotation}' - file: " 3436 + str(db_file) 3437 + " and " 3438 + str(db_hdr_file) 3439 ) 3440 3441 # Load header as VCF object 3442 db_hdr_vcf = Variants(input=db_hdr_file) 3443 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3444 log.debug( 3445 "Annotation database header: " 3446 + str(db_hdr_vcf_header_infos) 3447 ) 3448 3449 # For all fields in database 3450 annotation_fields_full = False 3451 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3452 annotation_fields = { 3453 key: key for key in db_hdr_vcf_header_infos 3454 } 3455 log.debug( 3456 
"Annotation database header - All annotations added: " 3457 + str(annotation_fields) 3458 ) 3459 annotation_fields_full = True 3460 3461 # Init 3462 cyvcf2_header_rename_dict = {} 3463 cyvcf2_header_list = [] 3464 cyvcf2_header_indexes = {} 3465 3466 # process annotation fields 3467 for annotation_field in annotation_fields: 3468 3469 # New annotation name 3470 annotation_field_new = annotation_fields[annotation_field] 3471 3472 # Check annotation field and index in header 3473 if ( 3474 annotation_field 3475 in db_hdr_vcf.get_header_columns_as_list() 3476 ): 3477 annotation_field_index = ( 3478 db_hdr_vcf.get_header_columns_as_list().index( 3479 annotation_field 3480 ) 3481 - 3 3482 ) 3483 cyvcf2_header_indexes[annotation_field_new] = ( 3484 annotation_field_index 3485 ) 3486 else: 3487 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3488 log.error(msg_err) 3489 raise ValueError(msg_err) 3490 3491 # Append annotation field in cyvcf2 header list 3492 cyvcf2_header_rename_dict[annotation_field_new] = ( 3493 db_hdr_vcf_header_infos[annotation_field].id 3494 ) 3495 cyvcf2_header_list.append( 3496 { 3497 "ID": annotation_field_new, 3498 "Number": db_hdr_vcf_header_infos[ 3499 annotation_field 3500 ].num, 3501 "Type": db_hdr_vcf_header_infos[ 3502 annotation_field 3503 ].type, 3504 "Description": db_hdr_vcf_header_infos[ 3505 annotation_field 3506 ].desc, 3507 } 3508 ) 3509 3510 # Add header on VCF 3511 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3512 annotation_field_new, 3513 db_hdr_vcf_header_infos[annotation_field].num, 3514 db_hdr_vcf_header_infos[annotation_field].type, 3515 db_hdr_vcf_header_infos[annotation_field].desc, 3516 "HOWARD BigWig annotation", 3517 "unknown", 3518 self.code_type_map[ 3519 db_hdr_vcf_header_infos[annotation_field].type 3520 ], 3521 ) 3522 3523 # Load bigwig database 3524 bw_db = pyBigWig.open(db_file) 3525 if bw_db.isBigWig(): 3526 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3527 else: 3528 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3529 log.error(msg_err) 3530 raise ValueError(msg_err) 3531 3532 annotation_bigwig_config_list.append( 3533 { 3534 "db_file": db_file, 3535 "bw_db": bw_db, 3536 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3537 "cyvcf2_header_list": cyvcf2_header_list, 3538 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3539 } 3540 ) 3541 3542 # Annotate 3543 if annotation_bigwig_config_list: 3544 3545 # Annotation config 3546 log.debug( 3547 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3548 ) 3549 3550 # Export VCF file 3551 self.export_variant_vcf( 3552 vcf_file=tmp_vcf_name, 3553 remove_info=True, 3554 add_samples=False, 3555 index=True, 3556 ) 3557 3558 # Load input tmp file 3559 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3560 3561 # Add header in input file 3562 for annotation_bigwig_config in annotation_bigwig_config_list: 3563 for cyvcf2_header_field in annotation_bigwig_config.get( 3564 "cyvcf2_header_list", [] 3565 ): 3566 log.info( 3567 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3568 ) 3569 input_vcf.add_info_to_header(cyvcf2_header_field) 3570 3571 # Create output VCF file 3572 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3573 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3574 3575 # Fetch variants 
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation field (and index)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # If value is not NaN
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        This function annotates with SnpSift.

        :param threads: Number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with SnpSift databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - snpSift
        snpsift_bin_command = get_bin_command(
            bin="SnpSift.jar",
            tool="snpsift",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpsift_bin_command:
            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - bcftools
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("snpsift", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if
not sql_query_chromosomes_df["count"][0]: 3704 log.info(f"VCF empty") 3705 return 3706 3707 # VCF header 3708 vcf_reader = self.get_header() 3709 log.debug("Initial header: " + str(vcf_reader.infos)) 3710 3711 # Existing annotations 3712 for vcf_annotation in self.get_header().infos: 3713 3714 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3715 log.debug( 3716 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3717 ) 3718 3719 if annotations: 3720 3721 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3722 3723 # Export VCF file 3724 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3725 3726 # Init 3727 commands = {} 3728 3729 for annotation in annotations: 3730 annotation_fields = annotations[annotation] 3731 3732 # Annotation Name 3733 annotation_name = os.path.basename(annotation) 3734 3735 if not annotation_fields: 3736 annotation_fields = {"INFO": None} 3737 3738 log.debug(f"Annotation '{annotation_name}'") 3739 log.debug( 3740 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3741 ) 3742 3743 # Create Database 3744 database = Database( 3745 database=annotation, 3746 databases_folders=databases_folders, 3747 assembly=assembly, 3748 ) 3749 3750 # Find files 3751 db_file = database.get_database() 3752 db_file = full_path(db_file) 3753 db_hdr_file = database.get_header_file() 3754 db_hdr_file = full_path(db_hdr_file) 3755 db_file_type = database.get_format() 3756 db_tbi_file = f"{db_file}.tbi" 3757 db_file_compressed = database.is_compressed() 3758 3759 # Check if compressed 3760 if not db_file_compressed: 3761 log.error( 3762 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3763 ) 3764 raise ValueError( 3765 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3766 ) 3767 3768 # Check if indexed 3769 if not os.path.exists(db_tbi_file): 3770 log.error( 3771 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 raise ValueError( 3774 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3775 ) 3776 3777 # Check index - try to create if not exists 3778 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3779 log.error("Annotation failed: database not valid") 3780 log.error(f"Annotation annotation file: {db_file}") 3781 log.error(f"Annotation annotation header: {db_hdr_file}") 3782 log.error(f"Annotation annotation index: {db_tbi_file}") 3783 raise ValueError( 3784 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3785 ) 3786 else: 3787 3788 log.debug( 3789 f"Annotation '{annotation}' - file: " 3790 + str(db_file) 3791 + " and " 3792 + str(db_hdr_file) 3793 ) 3794 3795 # Load header as VCF object 3796 db_hdr_vcf = Variants(input=db_hdr_file) 3797 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3798 log.debug( 3799 "Annotation database header: " 3800 + str(db_hdr_vcf_header_infos) 3801 ) 3802 3803 # For all fields in database 3804 annotation_fields_full = False 3805 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3806 annotation_fields = { 3807 key: key for key in db_hdr_vcf_header_infos 3808 } 3809 log.debug( 3810 "Annotation database header - All annotations added: " 3811 + str(annotation_fields) 3812 ) 3813 annotation_fields_full = True 3814 3815 # # Create file for field rename 3816 # log.debug("Create file for field rename") 3817 # tmp_rename = NamedTemporaryFile( 3818 # prefix=self.get_prefix(), 3819 # 
dir=self.get_tmp_dir(), 3820 # suffix=".rename", 3821 # delete=False, 3822 # ) 3823 # tmp_rename_name = tmp_rename.name 3824 # tmp_files.append(tmp_rename_name) 3825 3826 # Number of fields 3827 nb_annotation_field = 0 3828 annotation_list = [] 3829 annotation_infos_rename_list = [] 3830 3831 for annotation_field in annotation_fields: 3832 3833 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3834 annotation_fields_new_name = annotation_fields.get( 3835 annotation_field, annotation_field 3836 ) 3837 if not annotation_fields_new_name: 3838 annotation_fields_new_name = annotation_field 3839 3840 # Check if field is in DB and if field is not elready in input data 3841 if ( 3842 annotation_field in db_hdr_vcf.get_header().infos 3843 and annotation_fields_new_name 3844 not in self.get_header().infos 3845 ): 3846 3847 log.info( 3848 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3849 ) 3850 3851 # BCFTools annotate param to rename fields 3852 if annotation_field != annotation_fields_new_name: 3853 annotation_infos_rename_list.append( 3854 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3855 ) 3856 3857 # Add INFO field to header 3858 db_hdr_vcf_header_infos_number = ( 3859 db_hdr_vcf_header_infos[annotation_field].num or "." 3860 ) 3861 db_hdr_vcf_header_infos_type = ( 3862 db_hdr_vcf_header_infos[annotation_field].type 3863 or "String" 3864 ) 3865 db_hdr_vcf_header_infos_description = ( 3866 db_hdr_vcf_header_infos[annotation_field].desc 3867 or f"{annotation_field} description" 3868 ) 3869 db_hdr_vcf_header_infos_source = ( 3870 db_hdr_vcf_header_infos[annotation_field].source 3871 or "unknown" 3872 ) 3873 db_hdr_vcf_header_infos_version = ( 3874 db_hdr_vcf_header_infos[annotation_field].version 3875 or "unknown" 3876 ) 3877 3878 vcf_reader.infos[annotation_fields_new_name] = ( 3879 vcf.parser._Info( 3880 annotation_fields_new_name, 3881 db_hdr_vcf_header_infos_number, 3882 db_hdr_vcf_header_infos_type, 3883 db_hdr_vcf_header_infos_description, 3884 db_hdr_vcf_header_infos_source, 3885 db_hdr_vcf_header_infos_version, 3886 self.code_type_map[ 3887 db_hdr_vcf_header_infos_type 3888 ], 3889 ) 3890 ) 3891 3892 annotation_list.append(annotation_field) 3893 3894 nb_annotation_field += 1 3895 3896 else: 3897 3898 if ( 3899 annotation_field 3900 not in db_hdr_vcf.get_header().infos 3901 ): 3902 log.warning( 3903 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3904 ) 3905 if ( 3906 annotation_fields_new_name 3907 in self.get_header().infos 3908 ): 3909 log.warning( 3910 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3911 ) 3912 3913 log.info( 3914 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3915 ) 3916 3917 annotation_infos = ",".join(annotation_list) 3918 3919 if annotation_infos != "": 3920 3921 # Annotated VCF (and error file) 3922 tmp_annotation_vcf_name = os.path.join( 3923 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3924 ) 3925 tmp_annotation_vcf_name_err = ( 3926 tmp_annotation_vcf_name + ".err" 3927 ) 3928 3929 # Add fields to annotate 3930 if not annotation_fields_full: 3931 annotation_infos_option = f"-info {annotation_infos}" 3932 else: 3933 annotation_infos_option = "" 3934 3935 # Info fields rename 3936 if annotation_infos_rename_list: 3937 annotation_infos_rename = " -c " + ",".join( 3938 annotation_infos_rename_list 3939 ) 3940 else: 3941 annotation_infos_rename = "" 
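                            # The command assembled below pipes SnpSift annotate into
                            # bcftools annotate, which applies the optional field
                            # renames ("NEW:=INFO/OLD") and writes bgzipped output.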

                            # Annotate command
                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands[command_annotate] = tmp_annotation_vcf_name

                if commands:

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Num command
                    nb_command = 0

                    # Annotate
                    for command_annotate in commands:
                        nb_command += 1
                        log.info(
                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                        )
                        log.debug(f"command_annotate={command_annotate}")
                        run_parallel_commands([command_annotate], threads)

                        # Update variants
                        log.info(
                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                        )
                        self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        This function annotates with bcftools.

        :param threads: Number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
prefix=self.get_prefix(), 4064 dir=self.get_tmp_dir(), 4065 suffix=".vcf.gz", 4066 delete=False, 4067 ) 4068 tmp_vcf_name = tmp_vcf.name 4069 4070 # VCF header 4071 vcf_reader = self.get_header() 4072 log.debug("Initial header: " + str(vcf_reader.infos)) 4073 4074 # Existing annotations 4075 for vcf_annotation in self.get_header().infos: 4076 4077 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4078 log.debug( 4079 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4080 ) 4081 4082 if annotations: 4083 4084 tmp_ann_vcf_list = [] 4085 commands = [] 4086 tmp_files = [] 4087 err_files = [] 4088 4089 for annotation in annotations: 4090 annotation_fields = annotations[annotation] 4091 4092 # Annotation Name 4093 annotation_name = os.path.basename(annotation) 4094 4095 if not annotation_fields: 4096 annotation_fields = {"INFO": None} 4097 4098 log.debug(f"Annotation '{annotation_name}'") 4099 log.debug( 4100 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4101 ) 4102 4103 # Create Database 4104 database = Database( 4105 database=annotation, 4106 databases_folders=databases_folders, 4107 assembly=assembly, 4108 ) 4109 4110 # Find files 4111 db_file = database.get_database() 4112 db_file = full_path(db_file) 4113 db_hdr_file = database.get_header_file() 4114 db_hdr_file = full_path(db_hdr_file) 4115 db_file_type = database.get_format() 4116 db_tbi_file = f"{db_file}.tbi" 4117 db_file_compressed = database.is_compressed() 4118 4119 # Check if compressed 4120 if not db_file_compressed: 4121 log.error( 4122 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4123 ) 4124 raise ValueError( 4125 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4126 ) 4127 4128 # Check if indexed 4129 if not os.path.exists(db_tbi_file): 4130 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4131 raise ValueError( 4132 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4133 ) 4134 4135 # Check index - try to create if not exists 4136 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4137 log.error("Annotation failed: database not valid") 4138 log.error(f"Annotation annotation file: {db_file}") 4139 log.error(f"Annotation annotation header: {db_hdr_file}") 4140 log.error(f"Annotation annotation index: {db_tbi_file}") 4141 raise ValueError( 4142 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4143 ) 4144 else: 4145 4146 log.debug( 4147 f"Annotation '{annotation}' - file: " 4148 + str(db_file) 4149 + " and " 4150 + str(db_hdr_file) 4151 ) 4152 4153 # Load header as VCF object 4154 db_hdr_vcf = Variants(input=db_hdr_file) 4155 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4156 log.debug( 4157 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4158 ) 4159 4160 # For all fields in database 4161 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4162 annotation_fields = { 4163 key: key for key in db_hdr_vcf_header_infos 4164 } 4165 log.debug( 4166 "Annotation database header - All annotations added: " 4167 + str(annotation_fields) 4168 ) 4169 4170 # Number of fields 4171 nb_annotation_field = 0 4172 annotation_list = [] 4173 4174 for annotation_field in annotation_fields: 4175 4176 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 4177 annotation_fields_new_name = annotation_fields.get( 4178 annotation_field, annotation_field 4179 ) 4180 if not annotation_fields_new_name: 4181 annotation_fields_new_name = annotation_field 4182 4183 # Check if field is in DB and if field is not elready in input data 4184 if ( 4185 annotation_field in db_hdr_vcf.get_header().infos 4186 and annotation_fields_new_name 4187 not in self.get_header().infos 4188 ): 4189 4190 log.info( 4191 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4192 ) 4193 4194 # Add INFO field to header 4195 db_hdr_vcf_header_infos_number = ( 4196 db_hdr_vcf_header_infos[annotation_field].num or "." 4197 ) 4198 db_hdr_vcf_header_infos_type = ( 4199 db_hdr_vcf_header_infos[annotation_field].type 4200 or "String" 4201 ) 4202 db_hdr_vcf_header_infos_description = ( 4203 db_hdr_vcf_header_infos[annotation_field].desc 4204 or f"{annotation_field} description" 4205 ) 4206 db_hdr_vcf_header_infos_source = ( 4207 db_hdr_vcf_header_infos[annotation_field].source 4208 or "unknown" 4209 ) 4210 db_hdr_vcf_header_infos_version = ( 4211 db_hdr_vcf_header_infos[annotation_field].version 4212 or "unknown" 4213 ) 4214 4215 vcf_reader.infos[annotation_fields_new_name] = ( 4216 vcf.parser._Info( 4217 annotation_fields_new_name, 4218 db_hdr_vcf_header_infos_number, 4219 db_hdr_vcf_header_infos_type, 4220 db_hdr_vcf_header_infos_description, 4221 db_hdr_vcf_header_infos_source, 4222 db_hdr_vcf_header_infos_version, 4223 self.code_type_map[db_hdr_vcf_header_infos_type], 4224 ) 4225 ) 4226 4227 # annotation_list.append(annotation_field) 4228 if annotation_field != annotation_fields_new_name: 4229 annotation_list.append( 4230 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4231 ) 4232 else: 4233 annotation_list.append(annotation_field) 4234 4235 nb_annotation_field += 1 4236 4237 else: 4238 4239 if annotation_field not in db_hdr_vcf.get_header().infos: 4240 log.warning( 4241 f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4242 ) 4243 if annotation_fields_new_name in self.get_header().infos: 4244 log.warning( 4245 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4246 ) 4247 4248 log.info( 4249 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4250 ) 4251 4252 annotation_infos = ",".join(annotation_list) 4253 4254 if annotation_infos != "": 4255 4256 # Protect header for bcftools (remove "#CHROM" and variants line) 4257 log.debug("Protect Header file - remove #CHROM line if exists") 4258 tmp_header_vcf = NamedTemporaryFile( 4259 prefix=self.get_prefix(), 4260 dir=self.get_tmp_dir(), 4261 suffix=".hdr", 4262 delete=False, 4263 ) 4264 tmp_header_vcf_name = tmp_header_vcf.name 4265 tmp_files.append(tmp_header_vcf_name) 4266 # Command 4267 if db_hdr_file.endswith(".gz"): 4268 command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4269 else: 4270 command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4271 # Run 4272 run_parallel_commands([command_extract_header], 1) 4273 4274 # Find chomosomes 4275 log.debug("Find chromosomes ") 4276 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4277 sql_query_chromosomes_df = self.get_query_to_df( 4278 sql_query_chromosomes 4279 ) 4280 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4281 4282 
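                        # Region strategy: for each chromosome, a BED of merged
                        # windows (1 Mb on each side of every variant) is built so
                        # that bcftools annotate only scans the relevant database slices.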
log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4283 4284 # BED columns in the annotation file 4285 if db_file_type in ["bed"]: 4286 annotation_infos = "CHROM,POS,POS," + annotation_infos 4287 4288 for chrom in chomosomes_list: 4289 4290 # Create BED on initial VCF 4291 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4292 tmp_bed = NamedTemporaryFile( 4293 prefix=self.get_prefix(), 4294 dir=self.get_tmp_dir(), 4295 suffix=".bed", 4296 delete=False, 4297 ) 4298 tmp_bed_name = tmp_bed.name 4299 tmp_files.append(tmp_bed_name) 4300 4301 # Detecte regions 4302 log.debug( 4303 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4304 ) 4305 window = 1000000 4306 sql_query_intervals_for_bed = f""" 4307 SELECT \"#CHROM\", 4308 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4309 \"POS\"+{window} 4310 FROM {table_variants} as table_variants 4311 WHERE table_variants.\"#CHROM\" = '{chrom}' 4312 """ 4313 regions = self.conn.execute( 4314 sql_query_intervals_for_bed 4315 ).fetchall() 4316 merged_regions = merge_regions(regions) 4317 log.debug( 4318 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 4319 ) 4320 4321 header = ["#CHROM", "START", "END"] 4322 with open(tmp_bed_name, "w") as f: 4323 # Write the header with tab delimiter 4324 f.write("\t".join(header) + "\n") 4325 for d in merged_regions: 4326 # Write each data row with tab delimiter 4327 f.write("\t".join(map(str, d)) + "\n") 4328 4329 # Tmp files 4330 tmp_annotation_vcf = NamedTemporaryFile( 4331 prefix=self.get_prefix(), 4332 dir=self.get_tmp_dir(), 4333 suffix=".vcf.gz", 4334 delete=False, 4335 ) 4336 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4337 tmp_files.append(tmp_annotation_vcf_name) 4338 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4339 tmp_annotation_vcf_name_err = ( 4340 tmp_annotation_vcf_name + ".err" 4341 ) 4342 err_files.append(tmp_annotation_vcf_name_err) 4343 4344 # Annotate Command 4345 log.debug( 4346 f"Annotation '{annotation}' - add bcftools command" 4347 ) 4348 4349 # Command 4350 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 4351 4352 # Add command 4353 commands.append(command_annotate) 4354 4355 # if some commands 4356 if commands: 4357 4358 # Export VCF file 4359 self.export_variant_vcf( 4360 vcf_file=tmp_vcf_name, 4361 remove_info=True, 4362 add_samples=False, 4363 index=True, 4364 ) 4365 4366 # Threads 4367 # calculate threads for annotated commands 4368 if commands: 4369 threads_bcftools_annotate = round(threads / len(commands)) 4370 else: 4371 threads_bcftools_annotate = 1 4372 4373 if not threads_bcftools_annotate: 4374 threads_bcftools_annotate = 1 4375 4376 # Add threads option to bcftools commands 4377 if threads_bcftools_annotate > 1: 4378 commands_threaded = [] 4379 for command in commands: 4380 commands_threaded.append( 4381 command.replace( 4382 f"{bcftools_bin_command} annotate ", 4383 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4384 ) 4385 ) 4386 commands = commands_threaded 4387 4388 # Command annotation multithreading 4389 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4390 log.info( 4391 f"Annotation - Annotation multithreaded in " 4392 + str(len(commands)) 4393 + " 
commands" 4394 ) 4395 4396 run_parallel_commands(commands, threads) 4397 4398 # Merge 4399 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4400 4401 if tmp_ann_vcf_list_cmd: 4402 4403 # Tmp file 4404 tmp_annotate_vcf = NamedTemporaryFile( 4405 prefix=self.get_prefix(), 4406 dir=self.get_tmp_dir(), 4407 suffix=".vcf.gz", 4408 delete=True, 4409 ) 4410 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4411 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4412 err_files.append(tmp_annotate_vcf_name_err) 4413 4414 # Tmp file remove command 4415 tmp_files_remove_command = "" 4416 if tmp_files: 4417 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4418 4419 # Command merge 4420 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4421 log.info( 4422 f"Annotation - Annotation merging " 4423 + str(len(commands)) 4424 + " annotated files" 4425 ) 4426 log.debug(f"Annotation - merge command: {merge_command}") 4427 run_parallel_commands([merge_command], 1) 4428 4429 # Error messages 4430 log.info(f"Error/Warning messages:") 4431 error_message_command_all = [] 4432 error_message_command_warning = [] 4433 error_message_command_err = [] 4434 for err_file in err_files: 4435 with open(err_file, "r") as f: 4436 for line in f: 4437 message = line.strip() 4438 error_message_command_all.append(message) 4439 if line.startswith("[W::"): 4440 error_message_command_warning.append(message) 4441 if line.startswith("[E::"): 4442 error_message_command_err.append( 4443 f"{err_file}: " + message 4444 ) 4445 # log info 4446 for message in list( 4447 set(error_message_command_err + error_message_command_warning) 4448 ): 4449 log.info(f" {message}") 4450 # debug info 4451 for message in list(set(error_message_command_all)): 4452 log.debug(f" {message}") 4453 # failed 4454 if len(error_message_command_err): 4455 log.error("Annotation failed: Error in commands") 4456 raise ValueError("Annotation failed: Error in commands") 4457 4458 # Update variants 4459 log.info(f"Annotation - Updating...") 4460 self.update_from_vcf(tmp_annotate_vcf_name) 4461 4462 def annotation_exomiser(self, threads: int = None) -> None: 4463 """ 4464 This function annotate with Exomiser 4465 4466 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4467 - "analysis" (dict/file): 4468 Full analysis dictionnary parameters (see Exomiser docs). 4469 Either a dict, or a file in JSON or YAML format. 4470 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4471 Default : None 4472 - "preset" (string): 4473 Analysis preset (available in config folder). 4474 Used if no full "analysis" is provided. 4475 Default: "exome" 4476 - "phenopacket" (dict/file): 4477 Samples and phenotipic features parameters (see Exomiser docs). 4478 Either a dict, or a file in JSON or YAML format. 4479 Default: None 4480 - "subject" (dict): 4481 Sample parameters (see Exomiser docs). 4482 Example: 4483 "subject": 4484 { 4485 "id": "ISDBM322017", 4486 "sex": "FEMALE" 4487 } 4488 Default: None 4489 - "sample" (string): 4490 Sample name to construct "subject" section: 4491 "subject": 4492 { 4493 "id": "<sample>", 4494 "sex": "UNKNOWN_SEX" 4495 } 4496 Default: None 4497 - "phenotypicFeatures" (dict) 4498 Phenotypic features to construct "subject" section. 
    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotates with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO).
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "outputOptions" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl").
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add Exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample is given in parameters, the first sample in the VCF is chosen
        - If no HPO is found, the "hiPhivePrioritiser" analysis step is switched off

        :param threads: The number of threads to use
        :return: None.
        """
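
        # Illustrative parameter sketch (assumption, not executed): a minimal
        # "exomiser" section consumed by this method might look like
        #
        #   param = {
        #       "annotation": {
        #           "exomiser": {
        #               "preset": "exome",
        #               "sample": "ISDBM322017",
        #               "hpo": ["0001156", "0001363"],
        #               "exomiser_to_info": True,
        #           }
        #       }
        #   }
        #
        # Sample name and HPO ids here are examples only.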
        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if they do not exist)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get("exomiser_application_properties", None)
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict,
                # either analysis in param or by default
                # (depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # Analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(param_exomiser_analysis):
                        # Load analysis file into analysis dict (either yaml or json)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error("Analysis type unknown. Check param file.")
                        raise ValueError("Analysis type unknown. Check param file.")
Check param file.") 4680 4681 # Case no input analysis config file/dict 4682 # Use preset (exome/genome) to open default config file 4683 if not param_exomiser_analysis_dict: 4684 4685 # default preset 4686 default_preset = "exome" 4687 4688 # Get param preset or default preset 4689 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4690 4691 # Try to find if preset is a file 4692 if os.path.exists(param_exomiser_preset): 4693 # Preset file is provided in full path 4694 param_exomiser_analysis_default_config_file = ( 4695 param_exomiser_preset 4696 ) 4697 # elif os.path.exists(full_path(param_exomiser_preset)): 4698 # # Preset file is provided in full path 4699 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4700 elif os.path.exists( 4701 os.path.join(folder_config, param_exomiser_preset) 4702 ): 4703 # Preset file is provided a basename in config folder (can be a path with subfolders) 4704 param_exomiser_analysis_default_config_file = os.path.join( 4705 folder_config, param_exomiser_preset 4706 ) 4707 else: 4708 # Construct preset file 4709 param_exomiser_analysis_default_config_file = os.path.join( 4710 folder_config, 4711 f"preset-{param_exomiser_preset}-analysis.json", 4712 ) 4713 4714 # If preset file exists 4715 param_exomiser_analysis_default_config_file = full_path( 4716 param_exomiser_analysis_default_config_file 4717 ) 4718 if os.path.exists(param_exomiser_analysis_default_config_file): 4719 # Load prest file into analysis dict (either yaml or json) 4720 with open( 4721 param_exomiser_analysis_default_config_file 4722 ) as json_file: 4723 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4724 json_file 4725 ) 4726 4727 # Error preset file 4728 else: 4729 log.error( 4730 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4731 ) 4732 raise ValueError( 4733 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4734 ) 4735 4736 # If no analysis dict created 4737 if not param_exomiser_analysis_dict: 4738 log.error(f"No analysis config") 4739 raise ValueError(f"No analysis config") 4740 4741 # Log 4742 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4743 4744 ### PHENOPACKET ### 4745 ################### 4746 4747 # If no PhenoPacket in analysis dict -> check in param 4748 if "phenopacket" not in param_exomiser_analysis_dict: 4749 4750 # If PhenoPacket in param -> load anlaysis json 4751 if param_exomiser.get("phenopacket", None): 4752 4753 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4754 param_exomiser_phenopacket = full_path( 4755 param_exomiser_phenopacket 4756 ) 4757 4758 # If param phenopacket is a file and exists 4759 if isinstance( 4760 param_exomiser_phenopacket, str 4761 ) and os.path.exists(param_exomiser_phenopacket): 4762 # Load phenopacket file into analysis dict (either yaml or json) 4763 with open(param_exomiser_phenopacket) as json_file: 4764 param_exomiser_analysis_dict["phenopacket"] = ( 4765 yaml.safe_load(json_file) 4766 ) 4767 4768 # If param phenopacket is a dict 4769 elif isinstance(param_exomiser_phenopacket, dict): 4770 # Load phenopacket dict into analysis dict (either yaml or json) 4771 param_exomiser_analysis_dict["phenopacket"] = ( 4772 param_exomiser_phenopacket 4773 ) 4774 4775 # Error phenopacket type 4776 else: 4777 log.error(f"Phenopacket type unknown. Check param file.") 4778 raise ValueError( 4779 f"Phenopacket type unknown. Check param file." 
                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject does not exist -> find sample ID
                    if not param_exomiser_subject:

                        # Find sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error("No sample found")
                                raise ValueError("No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"]["subject"] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get("phenotypicFeatures", [])

                    # If phenotypicFeatures does not exist -> try to infer from HPO list
                    if not param_exomiser_phenotypicfeatures:

                        # Find HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"]["phenotypicFeatures"] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures still does not exist -> remove hiPhivePrioritiser step
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get("analysis", {}).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get("steps", []).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # Default output formats
                    default_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no outputOptions in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": default_output_formats,
                        }
"outputContributingVariantsOnly": False, 4903 "numGenes": 0, 4904 "outputFormats": defaut_output_formats, 4905 } 4906 4907 # Replace outputDirectory in output options 4908 output_options["outputDirectory"] = output_results 4909 output_options["outputFileName"] = "howard" 4910 4911 # Add outputOptions in analysis dict 4912 param_exomiser_analysis_dict["outputOptions"] = output_options 4913 4914 else: 4915 4916 # Replace output_results and output format (if exists in param) 4917 param_exomiser_analysis_dict["outputOptions"][ 4918 "outputDirectory" 4919 ] = output_results 4920 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4921 list( 4922 set( 4923 param_exomiser_analysis_dict.get( 4924 "outputOptions", {} 4925 ).get("outputFormats", []) 4926 + ["TSV_VARIANT", "VCF"] 4927 ) 4928 ) 4929 ) 4930 4931 # log 4932 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4933 4934 ### ANALYSIS FILE ### 4935 ##################### 4936 4937 ### Full JSON analysis config file ### 4938 4939 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4940 with open(exomiser_analysis, "w") as fp: 4941 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4942 4943 ### SPLIT analysis and sample config files 4944 4945 # Splitted analysis dict 4946 param_exomiser_analysis_dict_for_split = ( 4947 param_exomiser_analysis_dict.copy() 4948 ) 4949 4950 # Phenopacket JSON file 4951 exomiser_analysis_phenopacket = os.path.join( 4952 tmp_dir, "analysis_phenopacket.json" 4953 ) 4954 with open(exomiser_analysis_phenopacket, "w") as fp: 4955 json.dump( 4956 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4957 fp, 4958 indent=4, 4959 ) 4960 4961 # Analysis JSON file without Phenopacket parameters 4962 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4963 exomiser_analysis_analysis = os.path.join( 4964 tmp_dir, "analysis_analysis.json" 4965 ) 4966 with open(exomiser_analysis_analysis, "w") as fp: 4967 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4968 4969 ### INITAL VCF file ### 4970 ####################### 4971 4972 ### Create list of samples to use and include inti initial VCF file #### 4973 4974 # Subject (main sample) 4975 # Get sample ID in analysis dict 4976 sample_subject = ( 4977 param_exomiser_analysis_dict.get("phenopacket", {}) 4978 .get("subject", {}) 4979 .get("id", None) 4980 ) 4981 sample_proband = ( 4982 param_exomiser_analysis_dict.get("phenopacket", {}) 4983 .get("proband", {}) 4984 .get("subject", {}) 4985 .get("id", None) 4986 ) 4987 sample = [] 4988 if sample_subject: 4989 sample.append(sample_subject) 4990 if sample_proband: 4991 sample.append(sample_proband) 4992 4993 # Get sample ID within Pedigree 4994 pedigree_persons_list = ( 4995 param_exomiser_analysis_dict.get("phenopacket", {}) 4996 .get("pedigree", {}) 4997 .get("persons", {}) 4998 ) 4999 5000 # Create list with all sample ID in pedigree (if exists) 5001 pedigree_persons = [] 5002 for person in pedigree_persons_list: 5003 pedigree_persons.append(person.get("individualId")) 5004 5005 # Concat subject sample ID and samples ID in pedigreesamples 5006 samples = list(set(sample + pedigree_persons)) 5007 5008 # Check if sample list is not empty 5009 if not samples: 5010 log.error(f"No samples found") 5011 raise ValueError(f"No samples found") 5012 5013 # Create VCF with sample (either sample in param or first one by default) 5014 # Export VCF file 5015 self.export_variant_vcf( 5016 vcf_file=tmp_vcf_name, 5017 remove_info=True, 5018 add_samples=True, 5019 list_samples=samples, 5020 
                ### Execute Exomiser ###
                ########################

                # Init command
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # Phenotype data version
                    exomiser_options += f" --exomiser.phenotype.data-version={exomiser_release} "
                    # Data version
                    exomiser_options += f" --exomiser.{assembly}.data-version={exomiser_release} "
                    # Variant white list
                    variant_white_list_file = f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    if os.path.exists(os.path.join(databases_folders, assembly, variant_white_list_file)):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # Transcript source (ucsc, refseq, ensembl)
                transcript_source = param_exomiser.get("transcript_source", None)
                if transcript_source:
                    exomiser_options += f" --exomiser.{assembly}.transcript-source={transcript_source} "

                # If analysis contains proband param
                if param_exomiser_analysis_dict.get("phenopacket", {}).get("proband", {}):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually a single sample)
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command
                result = subprocess.call(exomiser_command_analysis.split(), stdout=subprocess.PIPE)
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Option to explode Exomiser TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: default String; Float for fully numeric
                            # object columns, Integer for numeric dtypes
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if pd.to_numeric(header_column_df, errors="coerce").notnull().all():
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"
                            # Header info
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(pattern, "_", f"Exomiser_{header_column}".replace("#", ""))
                            header_info_number = "."
                            header_info_description = f"Exomiser {header_column} annotation"
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to concat fields for update
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )
                                    ELSE ''
                                END
                                """
                            )

                    # Update query
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                    """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        This function annotates with snpEff.

        :param threads: The number of threads to use
        :return: None.
        """
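
        # Illustrative parameter sketch (assumption, not executed): a minimal
        # "snpeff" section might look like
        #
        #   param = {"annotation": {"snpeff": {
        #       "options": " -hgvs ",
        #       "stats": "OUTPUT.html",
        #   }}}
        #
        # "OUTPUT" is replaced by the output file name below; "-hgvs" is only an
        # example of a snpEff option.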
5246 """ 5247 5248 # DEBUG 5249 log.debug("Start annotation with snpeff databases") 5250 5251 # Threads 5252 if not threads: 5253 threads = self.get_threads() 5254 log.debug("Threads: " + str(threads)) 5255 5256 # DEBUG 5257 delete_tmp = True 5258 if self.get_config().get("verbosity", "warning") in ["debug"]: 5259 delete_tmp = False 5260 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5261 5262 # Config 5263 config = self.get_config() 5264 log.debug("Config: " + str(config)) 5265 5266 # Config - Folders - Databases 5267 databases_folders = ( 5268 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5269 ) 5270 log.debug("Databases annotations: " + str(databases_folders)) 5271 5272 # Config - snpEff bin command 5273 snpeff_bin_command = get_bin_command( 5274 bin="snpEff.jar", 5275 tool="snpeff", 5276 bin_type="jar", 5277 config=config, 5278 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5279 ) 5280 if not snpeff_bin_command: 5281 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5282 log.error(msg_err) 5283 raise ValueError(msg_err) 5284 5285 # Config - snpEff databases 5286 snpeff_databases = ( 5287 config.get("folders", {}) 5288 .get("databases", {}) 5289 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5290 ) 5291 snpeff_databases = full_path(snpeff_databases) 5292 if snpeff_databases is not None and snpeff_databases != "": 5293 log.debug(f"Create snpEff databases folder") 5294 if not os.path.exists(snpeff_databases): 5295 os.makedirs(snpeff_databases) 5296 5297 # Param 5298 param = self.get_param() 5299 log.debug("Param: " + str(param)) 5300 5301 # Param 5302 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5303 log.debug("Options: " + str(options)) 5304 5305 # Param - Assembly 5306 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5307 5308 # Param - Options 5309 snpeff_options = ( 5310 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5311 ) 5312 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5313 snpeff_csvstats = ( 5314 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5315 ) 5316 if snpeff_stats: 5317 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5318 snpeff_stats = full_path(snpeff_stats) 5319 snpeff_options += f" -stats {snpeff_stats}" 5320 if snpeff_csvstats: 5321 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5322 snpeff_csvstats = full_path(snpeff_csvstats) 5323 snpeff_options += f" -csvStats {snpeff_csvstats}" 5324 5325 # Data 5326 table_variants = self.get_table_variants() 5327 5328 # Check if not empty 5329 log.debug("Check if not empty") 5330 sql_query_chromosomes = ( 5331 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5332 ) 5333 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5334 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5335 log.info(f"VCF empty") 5336 return 5337 5338 # Export in VCF 5339 log.debug("Create initial file to annotate") 5340 tmp_vcf = NamedTemporaryFile( 5341 prefix=self.get_prefix(), 5342 dir=self.get_tmp_dir(), 5343 suffix=".vcf.gz", 5344 delete=True, 5345 ) 5346 tmp_vcf_name = tmp_vcf.name 5347 5348 # VCF header 5349 vcf_reader = self.get_header() 5350 log.debug("Initial header: " + str(vcf_reader.infos)) 5351 5352 # Existing annotations 5353 for vcf_annotation in self.get_header().infos: 5354 5355 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5356 log.debug( 5357 
f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5358 ) 5359 5360 # Memory limit 5361 # if config.get("memory", None): 5362 # memory_limit = config.get("memory", "8G") 5363 # else: 5364 # memory_limit = "8G" 5365 memory_limit = self.get_memory("8G") 5366 log.debug(f"memory_limit: {memory_limit}") 5367 5368 # snpEff java options 5369 snpeff_java_options = ( 5370 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5371 ) 5372 log.debug(f"Exomiser java options: {snpeff_java_options}") 5373 5374 force_update_annotation = True 5375 5376 if "ANN" not in self.get_header().infos or force_update_annotation: 5377 5378 # Check snpEff database 5379 log.debug(f"Check snpEff databases {[assembly]}") 5380 databases_download_snpeff( 5381 folder=snpeff_databases, assemblies=[assembly], config=config 5382 ) 5383 5384 # Export VCF file 5385 self.export_variant_vcf( 5386 vcf_file=tmp_vcf_name, 5387 remove_info=True, 5388 add_samples=False, 5389 index=True, 5390 ) 5391 5392 # Tmp file 5393 err_files = [] 5394 tmp_annotate_vcf = NamedTemporaryFile( 5395 prefix=self.get_prefix(), 5396 dir=self.get_tmp_dir(), 5397 suffix=".vcf", 5398 delete=False, 5399 ) 5400 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5401 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5402 err_files.append(tmp_annotate_vcf_name_err) 5403 5404 # Command 5405 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5406 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5407 run_parallel_commands([snpeff_command], 1) 5408 5409 # Error messages 5410 log.info(f"Error/Warning messages:") 5411 error_message_command_all = [] 5412 error_message_command_warning = [] 5413 error_message_command_err = [] 5414 for err_file in err_files: 5415 with open(err_file, "r") as f: 5416 for line in f: 5417 message = line.strip() 5418 error_message_command_all.append(message) 5419 if line.startswith("[W::"): 5420 error_message_command_warning.append(message) 5421 if line.startswith("[E::"): 5422 error_message_command_err.append(f"{err_file}: " + message) 5423 # log info 5424 for message in list( 5425 set(error_message_command_err + error_message_command_warning) 5426 ): 5427 log.info(f" {message}") 5428 # debug info 5429 for message in list(set(error_message_command_all)): 5430 log.debug(f" {message}") 5431 # failed 5432 if len(error_message_command_err): 5433 log.error("Annotation failed: Error in commands") 5434 raise ValueError("Annotation failed: Error in commands") 5435 5436 # Find annotation in header 5437 with open(tmp_annotate_vcf_name, "rt") as f: 5438 header_list = self.read_vcf_header(f) 5439 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5440 5441 for ann in annovar_vcf_header.infos: 5442 if ann not in self.get_header().infos: 5443 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5444 5445 # Update variants 5446 log.info(f"Annotation - Updating...") 5447 self.update_from_vcf(tmp_annotate_vcf_name) 5448 5449 else: 5450 if "ANN" in self.get_header().infos: 5451 log.debug(f"Existing snpEff annotations in VCF") 5452 if force_update_annotation: 5453 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5454 5455 def annotation_annovar(self, threads: int = None) -> None: 5456 """ 5457 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5458 annotations 5459 5460 :param threads: number of threads to 
        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        annovar_databases = config.get("folders", {}).get("databases", {}).get("annovar", DEFAULT_ANNOVAR_FOLDER)
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected (first of list)")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = "Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(annovar_databases_assembly):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
sql_query_chromosomes_df["count"][0]: 5571 log.info(f"VCF empty") 5572 return 5573 5574 # VCF header 5575 vcf_reader = self.get_header() 5576 log.debug("Initial header: " + str(vcf_reader.infos)) 5577 5578 # Existing annotations 5579 for vcf_annotation in self.get_header().infos: 5580 5581 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5582 log.debug( 5583 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5584 ) 5585 5586 force_update_annotation = True 5587 5588 if annotations: 5589 5590 commands = [] 5591 tmp_annotates_vcf_name_list = [] 5592 5593 # Export in VCF 5594 log.debug("Create initial file to annotate") 5595 tmp_vcf = NamedTemporaryFile( 5596 prefix=self.get_prefix(), 5597 dir=self.get_tmp_dir(), 5598 suffix=".vcf.gz", 5599 delete=False, 5600 ) 5601 tmp_vcf_name = tmp_vcf.name 5602 tmp_files.append(tmp_vcf_name) 5603 tmp_files.append(tmp_vcf_name + ".tbi") 5604 5605 # Export VCF file 5606 self.export_variant_vcf( 5607 vcf_file=tmp_vcf_name, 5608 remove_info=".", 5609 add_samples=False, 5610 index=True, 5611 ) 5612 5613 # Create file for field rename 5614 log.debug("Create file for field rename") 5615 tmp_rename = NamedTemporaryFile( 5616 prefix=self.get_prefix(), 5617 dir=self.get_tmp_dir(), 5618 suffix=".rename", 5619 delete=False, 5620 ) 5621 tmp_rename_name = tmp_rename.name 5622 tmp_files.append(tmp_rename_name) 5623 5624 # Check Annovar database 5625 log.debug( 5626 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5627 ) 5628 databases_download_annovar( 5629 folder=annovar_databases, 5630 files=list(annotations.keys()), 5631 assemblies=[assembly], 5632 ) 5633 5634 for annotation in annotations: 5635 annotation_fields = annotations[annotation] 5636 5637 if not annotation_fields: 5638 annotation_fields = {"INFO": None} 5639 5640 log.info(f"Annotations Annovar - database '{annotation}'") 5641 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5642 5643 # Tmp file for annovar 5644 err_files = [] 5645 tmp_annotate_vcf_directory = TemporaryDirectory( 5646 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5647 ) 5648 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5649 tmp_annotate_vcf_name_annovar = ( 5650 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5651 ) 5652 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5653 err_files.append(tmp_annotate_vcf_name_err) 5654 tmp_files.append(tmp_annotate_vcf_name_err) 5655 5656 # Tmp file final vcf annotated by annovar 5657 tmp_annotate_vcf = NamedTemporaryFile( 5658 prefix=self.get_prefix(), 5659 dir=self.get_tmp_dir(), 5660 suffix=".vcf.gz", 5661 delete=False, 5662 ) 5663 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5664 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5665 tmp_files.append(tmp_annotate_vcf_name) 5666 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5667 5668 # Number of fields 5669 annotation_list = [] 5670 annotation_renamed_list = [] 5671 5672 for annotation_field in annotation_fields: 5673 5674 # field new name, if parametered SKIPPED !!!!!! 
                for annotation_field in annotation_fields:

                    # Field new name, if configured
                    # NOTE: renaming is currently not fully managed (TODO)
                    annotation_fields_new_name = annotation_fields.get(annotation_field, annotation_field)
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if force_update_annotation or annotation_fields_new_name not in self.get_header().infos:
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation
                        log.warning(f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)")

                    # Add rename info
                    run_parallel_commands(
                        [f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # Protocol
                protocol = annotation

                # Argument
                argument = ""

                # Operation
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith("ensGene"):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # Argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # Command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
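
                # Illustrative sketch (assumption): the rename file written above and
                # consumed by "bcftools annotate --rename-annots" below contains one
                # mapping per line, e.g.:
                #
                #   INFO/Func_refGene NewFunc_refGene
                #   INFO/Gene_refGene NewGene_refGene
                #
                # Field names here are examples only.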
                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, remove ANNOVAR fields, compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages
                log.info("Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(f"{err_file}: " + message)
                # Log info
                for message in list(set(error_message_command_err + error_message_command_warning)):
                    log.info(f"   {message}")
                # Debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # Failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".vcf.gz", delete=False)
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info("Annotation Annovar - Annotation merging " + str(len(tmp_annotates_vcf_name_list)) + " annotated files")
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)
                # Find annotation in header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info("Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean tmp files (kept in debug mode, see delete_tmp above)
            if delete_tmp:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug("Annotation Annovar - Annotation cleaning")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        It takes a VCF file and annotates it with a parquet database file.

        :param threads: number of threads to use for the annotation
        :return: None.
        """
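
        # Illustrative parameter sketch (assumption, not executed): a minimal
        # "parquet" section might look like
        #
        #   param = {"annotation": {"parquet": {"annotations": {
        #       "gnomad.parquet": {"AF": "gnomad_AF"},
        #   }}}}
        #
        # The special key "ALL" (with optional "formats"/"releases") scans every
        # available database, as handled below. File and field names are examples.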
        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        databases_folders = set(
            self.get_config().get("folders", {}).get("databases", {}).get("annotations", ["."])
            + self.get_config().get("folders", {}).get("databases", {}).get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = self.get_param().get("annotation", {}).get("parquet", {}).get("annotations", None)
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get("assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY))

        # Force Update Annotation
        force_update_annotation = (
            self.get_param().get("annotation", {}).get("options", {}).get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param().get("annotation", {}).get("options", {}).get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info("VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute("SELECT count(*) AS count FROM variants").fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]")

        # Added columns
        added_columns = []

        # Drop indexes
        log.debug("Drop indexes...")
        self.drop_indexes()

        if annotations:

            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(f"Annotation '{annotation_name}' - fields: {annotation_fields}")

                # Create Database
                database = Database(database=annotation, databases_folders=databases_folders, assembly=assembly)

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exist
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append("Annotation failed: Annotation file not found")
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )
                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
".join(msg_err_list)) 6009 else: 6010 # Get parquet connexion 6011 parquet_sql_attach = database.get_sql_database_attach( 6012 output="query" 6013 ) 6014 if parquet_sql_attach: 6015 self.conn.execute(parquet_sql_attach) 6016 parquet_file_link = database.get_sql_database_link() 6017 # Log 6018 log.debug( 6019 f"Annotation '{annotation_name}' - file: " 6020 + str(parquet_file) 6021 + " and " 6022 + str(parquet_hdr_file) 6023 ) 6024 6025 # Database full header columns 6026 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6027 parquet_hdr_file 6028 ) 6029 # Log 6030 log.debug( 6031 "Annotation database header columns : " 6032 + str(parquet_hdr_vcf_header_columns) 6033 ) 6034 6035 # Load header as VCF object 6036 parquet_hdr_vcf_header_infos = database.get_header().infos 6037 # Log 6038 log.debug( 6039 "Annotation database header: " 6040 + str(parquet_hdr_vcf_header_infos) 6041 ) 6042 6043 # Get extra infos 6044 parquet_columns = database.get_extra_columns() 6045 # Log 6046 log.debug("Annotation database Columns: " + str(parquet_columns)) 6047 6048 # Add extra columns if "ALL" in annotation_fields 6049 # if "ALL" in annotation_fields: 6050 # allow_add_extra_column = True 6051 if "ALL" in annotation_fields and database.get_extra_columns(): 6052 for extra_column in database.get_extra_columns(): 6053 if ( 6054 extra_column not in annotation_fields 6055 and extra_column.replace("INFO/", "") 6056 not in parquet_hdr_vcf_header_infos 6057 ): 6058 parquet_hdr_vcf_header_infos[extra_column] = ( 6059 vcf.parser._Info( 6060 extra_column, 6061 ".", 6062 "String", 6063 f"{extra_column} description", 6064 "unknown", 6065 "unknown", 6066 self.code_type_map["String"], 6067 ) 6068 ) 6069 6070 # For all fields in database 6071 annotation_fields_all = False 6072 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6073 annotation_fields_all = True 6074 annotation_fields = { 6075 key: key for key in parquet_hdr_vcf_header_infos 6076 } 6077 6078 log.debug( 6079 "Annotation database header - All annotations added: " 6080 + str(annotation_fields) 6081 ) 6082 6083 # Init 6084 6085 # List of annotation fields to use 6086 sql_query_annotation_update_info_sets = [] 6087 6088 # List of annotation to agregate 6089 sql_query_annotation_to_agregate = [] 6090 6091 # Number of fields 6092 nb_annotation_field = 0 6093 6094 # Annotation fields processed 6095 annotation_fields_processed = [] 6096 6097 # Columns mapping 6098 map_columns = database.map_columns( 6099 columns=annotation_fields, prefixes=["INFO/"] 6100 ) 6101 6102 # Query dict for fields to remove (update option) 6103 query_dict_remove = {} 6104 6105 # Fetch Anotation fields 6106 for annotation_field in annotation_fields: 6107 6108 # annotation_field_column 6109 annotation_field_column = map_columns.get( 6110 annotation_field, "INFO" 6111 ) 6112 6113 # field new name, if parametered 6114 annotation_fields_new_name = annotation_fields.get( 6115 annotation_field, annotation_field 6116 ) 6117 if not annotation_fields_new_name: 6118 annotation_fields_new_name = annotation_field 6119 6120 # To annotate 6121 # force_update_annotation = True 6122 # force_append_annotation = True 6123 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6124 if annotation_field in parquet_hdr_vcf_header_infos and ( 6125 force_update_annotation 6126 or force_append_annotation 6127 or ( 6128 annotation_fields_new_name 6129 not in self.get_header().infos 6130 ) 6131 ): 6132 
6133 # Add field to annotation to process list 6134 annotation_fields_processed.append( 6135 annotation_fields_new_name 6136 ) 6137 6138 # explode infos for the field 6139 annotation_fields_new_name_info_msg = "" 6140 if ( 6141 force_update_annotation 6142 and annotation_fields_new_name 6143 in self.get_header().infos 6144 ): 6145 # Remove field from INFO 6146 query = f""" 6147 UPDATE {table_variants} as table_variants 6148 SET INFO = REGEXP_REPLACE( 6149 concat(table_variants.INFO,''), 6150 ';*{annotation_fields_new_name}=[^;]*', 6151 '' 6152 ) 6153 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6154 """ 6155 annotation_fields_new_name_info_msg = " [update]" 6156 query_dict_remove[ 6157 f"remove 'INFO/{annotation_fields_new_name}'" 6158 ] = query 6159 6160 # Sep between fields in INFO 6161 nb_annotation_field += 1 6162 if nb_annotation_field > 1: 6163 annotation_field_sep = ";" 6164 else: 6165 annotation_field_sep = "" 6166 6167 log.info( 6168 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6169 ) 6170 6171 # Add INFO field to header 6172 parquet_hdr_vcf_header_infos_number = ( 6173 parquet_hdr_vcf_header_infos[annotation_field].num 6174 or "." 6175 ) 6176 parquet_hdr_vcf_header_infos_type = ( 6177 parquet_hdr_vcf_header_infos[annotation_field].type 6178 or "String" 6179 ) 6180 parquet_hdr_vcf_header_infos_description = ( 6181 parquet_hdr_vcf_header_infos[annotation_field].desc 6182 or f"{annotation_field} description" 6183 ) 6184 parquet_hdr_vcf_header_infos_source = ( 6185 parquet_hdr_vcf_header_infos[annotation_field].source 6186 or "unknown" 6187 ) 6188 parquet_hdr_vcf_header_infos_version = ( 6189 parquet_hdr_vcf_header_infos[annotation_field].version 6190 or "unknown" 6191 ) 6192 6193 vcf_reader.infos[annotation_fields_new_name] = ( 6194 vcf.parser._Info( 6195 annotation_fields_new_name, 6196 parquet_hdr_vcf_header_infos_number, 6197 parquet_hdr_vcf_header_infos_type, 6198 parquet_hdr_vcf_header_infos_description, 6199 parquet_hdr_vcf_header_infos_source, 6200 parquet_hdr_vcf_header_infos_version, 6201 self.code_type_map[ 6202 parquet_hdr_vcf_header_infos_type 6203 ], 6204 ) 6205 ) 6206 6207 # Append 6208 if force_append_annotation: 6209 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 6210 else: 6211 query_case_when_append = "" 6212 6213 # Annotation/Update query fields 6214 # Found in INFO column 6215 if ( 6216 annotation_field_column == "INFO" 6217 and "INFO" in parquet_hdr_vcf_header_columns 6218 ): 6219 sql_query_annotation_update_info_sets.append( 6220 f""" 6221 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6222 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6223 ELSE '' 6224 END 6225 """ 6226 ) 6227 # Found in a specific column 6228 else: 6229 sql_query_annotation_update_info_sets.append( 6230 f""" 6231 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6232 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6233 ELSE '' 6234 END 6235 """ 6236 ) 6237 sql_query_annotation_to_agregate.append( 6238 f""" 
string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6239 ) 6240 6241 # Not to annotate 6242 else: 6243 6244 if force_update_annotation: 6245 annotation_message = "forced" 6246 else: 6247 annotation_message = "skipped" 6248 6249 if annotation_field not in parquet_hdr_vcf_header_infos: 6250 log.warning( 6251 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6252 ) 6253 if annotation_fields_new_name in self.get_header().infos: 6254 log.warning( 6255 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6256 ) 6257 6258 # Check if ALL fields have to be annotated. Thus concat all INFO field 6259 # allow_annotation_full_info = True 6260 allow_annotation_full_info = not force_append_annotation 6261 6262 if parquet_type in ["regions"]: 6263 allow_annotation_full_info = False 6264 6265 if ( 6266 allow_annotation_full_info 6267 and nb_annotation_field == len(annotation_fields) 6268 and annotation_fields_all 6269 and ( 6270 "INFO" in parquet_hdr_vcf_header_columns 6271 and "INFO" in database.get_extra_columns() 6272 ) 6273 ): 6274 log.debug("Column INFO annotation enabled") 6275 sql_query_annotation_update_info_sets = [] 6276 sql_query_annotation_update_info_sets.append( 6277 f" table_parquet.INFO " 6278 ) 6279 6280 if sql_query_annotation_update_info_sets: 6281 6282 # Annotate 6283 log.info(f"Annotation '{annotation_name}' - Annotation...") 6284 6285 # Join query annotation update info sets for SQL 6286 sql_query_annotation_update_info_sets_sql = ",".join( 6287 sql_query_annotation_update_info_sets 6288 ) 6289 6290 # Check chromosomes list (and variants infos) 6291 sql_query_chromosomes = f""" 6292 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6293 FROM {table_variants} as table_variants 6294 GROUP BY table_variants."#CHROM" 6295 ORDER BY table_variants."#CHROM" 6296 """ 6297 sql_query_chromosomes_df = self.conn.execute( 6298 sql_query_chromosomes 6299 ).df() 6300 sql_query_chromosomes_dict = { 6301 entry["CHROM"]: { 6302 "count": entry["count_variants"], 6303 "min": entry["min_variants"], 6304 "max": entry["max_variants"], 6305 } 6306 for index, entry in sql_query_chromosomes_df.iterrows() 6307 } 6308 6309 # Init 6310 nb_of_query = 0 6311 nb_of_variant_annotated = 0 6312 query_dict = query_dict_remove 6313 6314 # for chrom in sql_query_chromosomes_df["CHROM"]: 6315 for chrom in sql_query_chromosomes_dict: 6316 6317 # Number of variant by chromosome 6318 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6319 chrom, {} 6320 ).get("count", 0) 6321 6322 log.debug( 6323 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
6324 ) 6325 6326 # Annotation with regions database 6327 if parquet_type in ["regions"]: 6328 sql_query_annotation_from_clause = f""" 6329 FROM ( 6330 SELECT 6331 '{chrom}' AS \"#CHROM\", 6332 table_variants_from.\"POS\" AS \"POS\", 6333 {",".join(sql_query_annotation_to_agregate)} 6334 FROM {table_variants} as table_variants_from 6335 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6336 table_parquet_from."#CHROM" = '{chrom}' 6337 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6338 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6339 ) 6340 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6341 GROUP BY table_variants_from.\"POS\" 6342 ) 6343 as table_parquet 6344 """ 6345 6346 sql_query_annotation_where_clause = """ 6347 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6348 AND table_parquet.\"POS\" = table_variants.\"POS\" 6349 """ 6350 6351 # Annotation with variants database 6352 else: 6353 sql_query_annotation_from_clause = f""" 6354 FROM {parquet_file_link} as table_parquet 6355 """ 6356 sql_query_annotation_where_clause = f""" 6357 table_variants."#CHROM" = '{chrom}' 6358 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6359 AND table_parquet.\"POS\" = table_variants.\"POS\" 6360 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6361 AND table_parquet.\"REF\" = table_variants.\"REF\" 6362 """ 6363 6364 # Create update query 6365 sql_query_annotation_chrom_interval_pos = f""" 6366 UPDATE {table_variants} as table_variants 6367 SET INFO = 6368 concat( 6369 CASE WHEN table_variants.INFO NOT IN ('','.') 6370 THEN table_variants.INFO 6371 ELSE '' 6372 END 6373 , 6374 CASE WHEN table_variants.INFO NOT IN ('','.') 6375 AND ( 6376 concat({sql_query_annotation_update_info_sets_sql}) 6377 ) 6378 NOT IN ('','.') 6379 THEN ';' 6380 ELSE '' 6381 END 6382 , 6383 {sql_query_annotation_update_info_sets_sql} 6384 ) 6385 {sql_query_annotation_from_clause} 6386 WHERE {sql_query_annotation_where_clause} 6387 ; 6388 """ 6389 6390 # Add update query to dict 6391 query_dict[ 6392 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6393 ] = sql_query_annotation_chrom_interval_pos 6394 6395 nb_of_query = len(query_dict) 6396 num_query = 0 6397 6398 # SET max_expression_depth TO x 6399 self.conn.execute("SET max_expression_depth TO 10000") 6400 6401 for query_name in query_dict: 6402 query = query_dict[query_name] 6403 num_query += 1 6404 log.info( 6405 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 
                        )
                        result = self.conn.execute(query)
                        nb_of_variant_annotated_by_query = result.df()["Count"][0]
                        nb_of_variant_annotated += nb_of_variant_annotated_by_query
                        log.info(
                            f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                        )

                    log.info(
                        f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                    )

                else:

                    log.info(
                        f"Annotation '{annotation_name}' - No Annotations available"
                    )

                log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        This function annotates variants with splice prediction tools (SPiP and SpliceAI),
        run as a Nextflow pipeline within a Docker container.

        :param threads: The number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with splice tools")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))
        splice_config = config.get("tools", {}).get("splice", {})
        if not splice_config:
            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
        if not splice_config:
            msg_err = "No Splice tool config"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"splice_config: {splice_config}")

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("splice", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Splice docker image
        splice_docker_image = splice_config.get("docker", {}).get("image")

        # Pull splice image if it's not already there
        if not check_docker_image_exists(splice_docker_image):
            log.warning(
                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
            )
            try:
                command(f"docker pull {splice_docker_image}")
            except subprocess.CalledProcessError:
                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Config - splice databases
        splice_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("splice", DEFAULT_SPLICE_FOLDER)
        )
        splice_databases = full_path(splice_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("splice", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return None

        # Export in VCF
        log.debug("Create initial file to annotate")

        # Create output folder / work folder
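        # Sketch of the folder resolution below, assuming the splice options come
        # from the 'annotation > splice > options' section of param.json (values
        # are hypothetical):
        #   options = {"output_folder": "/data/splice"} -> '/data/splice' (created if missing)
        #   options = {}                                -> '<tmp_dir>/splice-<random>'
        #   options = {"workdir": "/scratch"}           -> container work dir '/scratch' (default '/work')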
6515 if options.get("output_folder", ""): 6516 output_folder = options.get("output_folder", "") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 else: 6520 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6521 if not os.path.exists(output_folder): 6522 Path(output_folder).mkdir(parents=True, exist_ok=True) 6523 6524 if options.get("workdir", ""): 6525 workdir = options.get("workdir", "") 6526 else: 6527 workdir = "/work" 6528 6529 # Create tmp VCF file 6530 tmp_vcf = NamedTemporaryFile( 6531 prefix=self.get_prefix(), 6532 dir=output_folder, 6533 suffix=".vcf", 6534 delete=False, 6535 ) 6536 tmp_vcf_name = tmp_vcf.name 6537 6538 # VCF header 6539 header = self.get_header() 6540 6541 # Existing annotations 6542 for vcf_annotation in self.get_header().infos: 6543 6544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6545 log.debug( 6546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6547 ) 6548 6549 # Memory limit 6550 if config.get("memory", None): 6551 memory_limit = config.get("memory", "8G").upper() 6552 # upper() 6553 else: 6554 memory_limit = "8G" 6555 log.debug(f"memory_limit: {memory_limit}") 6556 6557 # Check number of variants to annotate 6558 where_clause_regex_spliceai = r"SpliceAI_\w+" 6559 where_clause_regex_spip = r"SPiP_\w+" 6560 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6561 df_list_of_variants_to_annotate = self.get_query_to_df( 6562 query=f""" SELECT * FROM variants {where_clause} """ 6563 ) 6564 if len(df_list_of_variants_to_annotate) == 0: 6565 log.warning( 6566 f"No variants to annotate with splice. 
                "Variants probably already annotated with splice"
            )
            return None
        else:
            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")

        # Export VCF file
        self.export_variant_vcf(
            vcf_file=tmp_vcf_name,
            remove_info=True,
            add_samples=True,
            index=False,
            where_clause=where_clause,
        )
        mount = [f" -v {path}:{path}:rw" for path in [output_folder]]
        if any(value for value in splice_config.values() if value is None):
            log.warning("At least one splice config parameter is empty")
            # exit annotation_splice
            return None

        # Params in splice nf
        def check_values(dico: dict):
            """
            Ensure parameters for the NF splice pipeline
            """
            for key, val in dico.items():
                if key == "genome":
                    if any(
                        assemb in options.get("genome", {})
                        for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
                    ):
                        yield f"--{key} hg19"
                    elif any(
                        assemb in options.get("genome", {})
                        for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
                    ):
                        yield f"--{key} hg38"
                elif (
                    (isinstance(val, str) and val)
                    or isinstance(val, int)
                    or isinstance(val, bool)
                ):
                    yield f"--{key} {val}"

        # Genome
        genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
        options["genome"] = genome
        # NF params
        nf_params = []
        # Add options
        if options:
            log.debug(options)
            nf_params = list(check_values(options))
            log.debug(f"Splice NF params: {' '.join(nf_params)}")
        else:
            log.debug("No NF params provided")
        # Add threads
        if "threads" not in options.keys():
            nf_params.append(f"--threads {threads}")
        # Genome path
        genome_path = find_genome(
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER),
            file=f"{genome}.fa",
        )
        # Add genome path
        if not genome_path:
            raise ValueError(
                f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
            )
        else:
            log.debug(f"Genome: {genome_path}")
            nf_params.append(f"--genome_path {genome_path}")

        def splice_annotations(options: dict = {}, config: dict = {}) -> list:
            """
            Set up updated databases for SPiP and SpliceAI
            """

            try:

                # SpliceAI assembly transcriptome
                spliceai_assembly = os.path.join(
                    config.get("folders", {}).get("databases", {}).get("spliceai", {}),
                    options.get("genome"),
                    "transcriptome",
                )
                spip_assembly = options.get("genome")

                spip = find(
                    f"transcriptome_{spip_assembly}.RData",
                    config.get("folders", {}).get("databases", {}).get("spip", {}),
                )
                spliceai = find("spliceai.refseq.txt", spliceai_assembly)
                log.debug(f"SPiP annotations: {spip}")
                log.debug(f"SpliceAI annotations: {spliceai}")
                if spip and spliceai:
                    return [
                        f"--spip_transcriptome {spip}",
                        f"--spliceai_transcriptome {spliceai}",
                    ]
                else:
                    log.warning(
                        "Can't find splice databases in configuration, using annotation files from the image"
                    )
            except TypeError:
                log.warning(
                    "Can't find splice databases in configuration, using annotation files from the image"
                )
            return []

        # Add options, checking whether a transcriptome option has already been provided
        if not any(
            "spip_transcriptome" in nf_param or "spliceai_transcriptome" in nf_param
            for nf_param in nf_params
        ):
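            # Minimal sketch of nf_params at this point, assuming a hg19 run with
            # default options (paths and values are illustrative, not from the source):
            #   ['--genome hg19', '--threads 4', '--genome_path /databases/genomes/hg19/hg19.fa']
            # splice_annotations() may then append, e.g.:
            #   '--spip_transcriptome .../transcriptome_hg19.RData'
            #   '--spliceai_transcriptome .../spliceai.refseq.txt'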
            splice_reference = splice_annotations(options, config)
            if splice_reference:
                nf_params.extend(splice_reference)
        # nf_params.append(f"--output_folder {output_folder}")
        random_uuid = f"HOWARD-SPLICE-{get_random()}"
        cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
        log.debug(cmd)
        splice_config["docker"]["command"] = cmd

        # Ensure proxy is set
        proxy = [
            f"-e {var}={os.getenv(var)}"
            for var in ["https_proxy", "http_proxy", "ftp_proxy"]
            if os.getenv(var) is not None
        ]
        docker_cmd = get_bin_command(
            tool="splice",
            bin_type="docker",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
            add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}",
        )
        log.debug(docker_cmd)
        res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
        log.debug(res.stdout)
        if res.stderr:
            log.error(res.stderr)
        res.check_returncode()

        # Update variants
        log.info("Annotation - Updating...")

        # Find the output VCF generated by the splice pipeline
        log.debug(
            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        )
        output_vcf = []
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if (
                files
                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
            ):
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            log.debug(f"Splice annotated vcf: {output_vcf[0]}")
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Create new header with splice infos
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            self.update_from_vcf(output_vcf[0])

            # Remove file
            remove_if_exists(output_vcf)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        :param name: The `get_config_default` function returns a dictionary containing default
        configurations for different calculations and prioritizations.
The `name` parameter is used to 6766 specify which specific configuration to retrieve from the dictionary 6767 :type name: str 6768 :return: The function `get_config_default` returns a dictionary containing default configuration 6769 settings for different calculations and prioritizations. The specific configuration settings are 6770 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6771 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6772 returned. If there is no match, an empty dictionary is returned. 6773 """ 6774 6775 config_default = { 6776 "calculations": { 6777 "variant_chr_pos_alt_ref": { 6778 "type": "sql", 6779 "name": "variant_chr_pos_alt_ref", 6780 "description": "Create a variant ID with chromosome, position, alt and ref", 6781 "available": False, 6782 "output_column_name": "variant_chr_pos_alt_ref", 6783 "output_column_type": "String", 6784 "output_column_description": "variant ID with chromosome, position, alt and ref", 6785 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6786 "operation_info": True, 6787 }, 6788 "VARTYPE": { 6789 "type": "sql", 6790 "name": "VARTYPE", 6791 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6792 "available": True, 6793 "table": "variants", 6794 "output_column_name": "VARTYPE", 6795 "output_column_type": "String", 6796 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6797 "operation_query": """ 6798 CASE 6799 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6800 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6801 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6802 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6803 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6804 ELSE 'UNDEFINED' 6805 END 6806 """, 6807 "info_fields": ["SVTYPE"], 6808 "operation_info": True, 6809 }, 6810 "snpeff_hgvs": { 6811 "type": "python", 6812 "name": "snpeff_hgvs", 6813 "description": "HGVS nomenclatures from snpEff annotation", 6814 "available": True, 6815 "function_name": "calculation_extract_snpeff_hgvs", 6816 "function_params": ["snpeff_hgvs", "ANN"], 6817 }, 6818 "snpeff_ann_explode": { 6819 "type": "python", 6820 "name": "snpeff_ann_explode", 6821 "description": "Explode snpEff annotations with uniquify values", 6822 "available": True, 6823 "function_name": "calculation_snpeff_ann_explode", 6824 "function_params": [False, "fields", "snpeff_", "ANN"], 6825 }, 6826 "snpeff_ann_explode_uniquify": { 6827 "type": "python", 6828 "name": "snpeff_ann_explode_uniquify", 6829 "description": "Explode snpEff annotations", 6830 "available": True, 6831 "function_name": "calculation_snpeff_ann_explode", 6832 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6833 }, 6834 "snpeff_ann_explode_json": { 6835 "type": "python", 6836 "name": "snpeff_ann_explode_json", 6837 "description": "Explode snpEff annotations in JSON format", 6838 "available": True, 6839 "function_name": "calculation_snpeff_ann_explode", 6840 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6841 }, 6842 "NOMEN": { 6843 "type": "python", 6844 "name": "NOMEN", 6845 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field (see parameters help)", 6846 "available": True, 6847 "function_name": "calculation_extract_nomen", 6848 "function_params": [], 6849 }, 6850 "RENAME_INFO_FIELDS": { 6851 "type": "python", 6852 "name": "RENAME_INFO_FIELDS", 6853 "description": "Rename or remove INFO/tags", 6854 "available": True, 6855 "function_name": "calculation_rename_info_fields", 6856 "function_params": [], 6857 }, 6858 "FINDBYPIPELINE": { 6859 "type": "python", 6860 "name": "FINDBYPIPELINE", 6861 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6862 "available": True, 6863 "function_name": "calculation_find_by_pipeline", 6864 "function_params": ["findbypipeline"], 6865 }, 6866 "FINDBYSAMPLE": { 6867 "type": "python", 6868 "name": "FINDBYSAMPLE", 6869 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6870 "available": True, 6871 "function_name": "calculation_find_by_pipeline", 6872 "function_params": ["findbysample"], 6873 }, 6874 "GENOTYPECONCORDANCE": { 6875 "type": "python", 6876 "name": "GENOTYPECONCORDANCE", 6877 "description": "Concordance of genotype for multi caller VCF", 6878 "available": True, 6879 "function_name": "calculation_genotype_concordance", 6880 "function_params": [], 6881 }, 6882 "BARCODE": { 6883 "type": "python", 6884 "name": "BARCODE", 6885 "description": "BARCODE as VaRank tool", 6886 "available": True, 6887 "function_name": "calculation_barcode", 6888 "function_params": [], 6889 }, 6890 "BARCODEFAMILY": { 6891 "type": "python", 6892 "name": "BARCODEFAMILY", 6893 "description": "BARCODEFAMILY as VaRank tool", 6894 "available": True, 6895 "function_name": "calculation_barcode_family", 6896 "function_params": ["BCF"], 6897 }, 6898 "TRIO": { 6899 "type": "python", 6900 "name": "TRIO", 6901 "description": "Inheritance for a trio family", 6902 "available": True, 6903 "function_name": "calculation_trio", 6904 "function_params": [], 6905 }, 6906 "VAF": { 6907 "type": "python", 6908 "name": "VAF", 6909 "description": "Variant Allele Frequency (VAF) harmonization", 6910 "available": True, 6911 "function_name": "calculation_vaf_normalization", 6912 "function_params": [], 6913 }, 6914 "VAF_stats": { 6915 "type": "python", 6916 "name": "VAF_stats", 6917 "description": "Variant Allele Frequency (VAF) statistics", 6918 "available": True, 6919 "function_name": "calculation_genotype_stats", 6920 "function_params": ["VAF"], 6921 }, 6922 "DP_stats": { 6923 "type": "python", 6924 "name": "DP_stats", 6925 "description": "Depth (DP) statistics", 6926 "available": True, 6927 "function_name": "calculation_genotype_stats", 6928 "function_params": ["DP"], 6929 }, 6930 "variant_id": { 6931 "type": "python", 6932 "name": "variant_id", 6933 "description": "Variant ID generated from variant position and type", 6934 "available": True, 6935 "function_name": "calculation_variant_id", 6936 "function_params": [], 6937 }, 6938 "transcripts_json": { 6939 "type": "python", 6940 "name": "transcripts_json", 6941 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6942 "available": True, 6943 "function_name": "calculation_transcripts_annotation", 6944 "function_params": ["transcripts_json", None], 6945 }, 6946 "transcripts_ann": { 6947 "type": "python", 6948 "name": "transcripts_ann", 6949 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6950 "available": True, 6951 "function_name": "calculation_transcripts_annotation", 6952 "function_params": 
[None, "transcripts_ann"], 6953 }, 6954 "transcripts_annotations": { 6955 "type": "python", 6956 "name": "transcripts_annotations", 6957 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6958 "available": True, 6959 "function_name": "calculation_transcripts_annotation", 6960 "function_params": [None, None], 6961 }, 6962 "transcripts_prioritization": { 6963 "type": "python", 6964 "name": "transcripts_prioritization", 6965 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6966 "available": True, 6967 "function_name": "calculation_transcripts_prioritization", 6968 "function_params": [], 6969 }, 6970 "transcripts_export": { 6971 "type": "python", 6972 "name": "transcripts_export", 6973 "description": "Export transcripts table/view as a file (using param.json)", 6974 "available": True, 6975 "function_name": "calculation_transcripts_export", 6976 "function_params": [], 6977 }, 6978 }, 6979 "prioritizations": { 6980 "default": { 6981 "ANN2": [ 6982 { 6983 "type": "contains", 6984 "value": "HIGH", 6985 "score": 5, 6986 "flag": "PASS", 6987 "comment": [ 6988 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6989 ], 6990 }, 6991 { 6992 "type": "contains", 6993 "value": "MODERATE", 6994 "score": 3, 6995 "flag": "PASS", 6996 "comment": [ 6997 "A non-disruptive variant that might change protein effectiveness" 6998 ], 6999 }, 7000 { 7001 "type": "contains", 7002 "value": "LOW", 7003 "score": 0, 7004 "flag": "FILTERED", 7005 "comment": [ 7006 "Assumed to be mostly harmless or unlikely to change protein behavior" 7007 ], 7008 }, 7009 { 7010 "type": "contains", 7011 "value": "MODIFIER", 7012 "score": 0, 7013 "flag": "FILTERED", 7014 "comment": [ 7015 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7016 ], 7017 }, 7018 ], 7019 } 7020 }, 7021 } 7022 7023 return config_default.get(name, None) 7024 7025 def get_config_json( 7026 self, name: str, config_dict: dict = {}, config_file: str = None 7027 ) -> dict: 7028 """ 7029 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7030 default values, a dictionary, and a file. 7031 7032 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7033 the name of the configuration. It is used to identify and retrieve the configuration settings 7034 for a specific component or module 7035 :type name: str 7036 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7037 dictionary that allows you to provide additional configuration settings or overrides. When you 7038 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7039 the key is the configuration setting you want to override or 7040 :type config_dict: dict 7041 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7042 specify the path to a configuration file that contains additional settings. If provided, the 7043 function will read the contents of this file and update the configuration dictionary with the 7044 values found in the file, overriding any existing values with the 7045 :type config_file: str 7046 :return: The function `get_config_json` returns a dictionary containing the configuration 7047 settings. 
7048 """ 7049 7050 # Create with default prioritizations 7051 config_default = self.get_config_default(name=name) 7052 configuration = config_default 7053 # log.debug(f"configuration={configuration}") 7054 7055 # Replace prioritizations from dict 7056 for config in config_dict: 7057 configuration[config] = config_dict[config] 7058 7059 # Replace prioritizations from file 7060 config_file = full_path(config_file) 7061 if config_file: 7062 if os.path.exists(config_file): 7063 with open(config_file) as config_file_content: 7064 config_file_dict = yaml.safe_load(config_file_content) 7065 for config in config_file_dict: 7066 configuration[config] = config_file_dict[config] 7067 else: 7068 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7069 log.error(msg_error) 7070 raise ValueError(msg_error) 7071 7072 return configuration 7073 7074 def prioritization( 7075 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7076 ) -> bool: 7077 """ 7078 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7079 prioritizes variants based on configured profiles and criteria. 7080 7081 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7082 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7083 a table name is provided, the method will prioritize the variants in that specific table 7084 :type table: str 7085 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7086 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7087 provided, the code will use a default prefix value of "PZ" 7088 :type pz_prefix: str 7089 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7090 additional parameters specific to the prioritization process. These parameters can include 7091 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7092 configurations needed for the prioritization of variants in a V 7093 :type pz_param: dict 7094 :return: A boolean value (True) is being returned from the `prioritization` function. 
7095 """ 7096 7097 # Config 7098 config = self.get_config() 7099 7100 # Param 7101 param = self.get_param() 7102 7103 # Prioritization param 7104 if pz_param is not None: 7105 prioritization_param = pz_param 7106 else: 7107 prioritization_param = param.get("prioritization", {}) 7108 7109 # Configuration profiles 7110 prioritization_config_file = prioritization_param.get( 7111 "prioritization_config", None 7112 ) 7113 prioritization_config_file = full_path(prioritization_config_file) 7114 prioritizations_config = self.get_config_json( 7115 name="prioritizations", config_file=prioritization_config_file 7116 ) 7117 7118 # Prioritization prefix 7119 pz_prefix_default = "PZ" 7120 if pz_prefix is None: 7121 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7122 7123 # Prioritization options 7124 profiles = prioritization_param.get("profiles", []) 7125 if isinstance(profiles, str): 7126 profiles = profiles.split(",") 7127 pzfields = prioritization_param.get( 7128 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7129 ) 7130 if isinstance(pzfields, str): 7131 pzfields = pzfields.split(",") 7132 default_profile = prioritization_param.get("default_profile", None) 7133 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7134 prioritization_score_mode = prioritization_param.get( 7135 "prioritization_score_mode", "HOWARD" 7136 ) 7137 7138 # Quick Prioritizations 7139 prioritizations = param.get("prioritizations", None) 7140 if prioritizations: 7141 log.info("Quick Prioritization:") 7142 for profile in prioritizations.split(","): 7143 if profile not in profiles: 7144 profiles.append(profile) 7145 log.info(f" {profile}") 7146 7147 # If profile "ALL" provided, all profiles in the config profiles 7148 if "ALL" in profiles: 7149 profiles = list(prioritizations_config.keys()) 7150 7151 for profile in profiles: 7152 if prioritizations_config.get(profile, None): 7153 log.debug(f"Profile '{profile}' configured") 7154 else: 7155 msg_error = f"Profile '{profile}' NOT configured" 7156 log.error(msg_error) 7157 raise ValueError(msg_error) 7158 7159 if profiles: 7160 log.info(f"Prioritization... 
") 7161 else: 7162 log.debug(f"No profile defined") 7163 return False 7164 7165 if not default_profile and len(profiles): 7166 default_profile = profiles[0] 7167 7168 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7169 log.debug("Profiles to check: " + str(list(profiles))) 7170 7171 # Variables 7172 if table is not None: 7173 table_variants = table 7174 else: 7175 table_variants = self.get_table_variants(clause="update") 7176 log.debug(f"Table to prioritize: {table_variants}") 7177 7178 # Added columns 7179 added_columns = [] 7180 7181 # Create list of PZfields 7182 # List of PZFields 7183 list_of_pzfields_original = pzfields + [ 7184 pzfield + pzfields_sep + profile 7185 for pzfield in pzfields 7186 for profile in profiles 7187 ] 7188 list_of_pzfields = [] 7189 log.debug(f"{list_of_pzfields_original}") 7190 7191 # Remove existing PZfields to use if exists 7192 for pzfield in list_of_pzfields_original: 7193 if self.get_header().infos.get(pzfield, None) is None: 7194 list_of_pzfields.append(pzfield) 7195 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7196 else: 7197 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7198 7199 if list_of_pzfields: 7200 7201 # Explode Infos prefix 7202 explode_infos_prefix = self.get_explode_infos_prefix() 7203 7204 # PZfields tags description 7205 PZfields_INFOS = { 7206 f"{pz_prefix}Tags": { 7207 "ID": f"{pz_prefix}Tags", 7208 "Number": ".", 7209 "Type": "String", 7210 "Description": "Variant tags based on annotation criteria", 7211 }, 7212 f"{pz_prefix}Score": { 7213 "ID": f"{pz_prefix}Score", 7214 "Number": 1, 7215 "Type": "Integer", 7216 "Description": "Variant score based on annotation criteria", 7217 }, 7218 f"{pz_prefix}Flag": { 7219 "ID": f"{pz_prefix}Flag", 7220 "Number": 1, 7221 "Type": "String", 7222 "Description": "Variant flag based on annotation criteria", 7223 }, 7224 f"{pz_prefix}Comment": { 7225 "ID": f"{pz_prefix}Comment", 7226 "Number": ".", 7227 "Type": "String", 7228 "Description": "Variant comment based on annotation criteria", 7229 }, 7230 f"{pz_prefix}Infos": { 7231 "ID": f"{pz_prefix}Infos", 7232 "Number": ".", 7233 "Type": "String", 7234 "Description": "Variant infos based on annotation criteria", 7235 }, 7236 f"{pz_prefix}Class": { 7237 "ID": f"{pz_prefix}Class", 7238 "Number": ".", 7239 "Type": "String", 7240 "Description": "Variant class based on annotation criteria", 7241 }, 7242 } 7243 7244 # Create INFO fields if not exist 7245 for field in PZfields_INFOS: 7246 field_ID = PZfields_INFOS[field]["ID"] 7247 field_description = PZfields_INFOS[field]["Description"] 7248 if field_ID not in self.get_header().infos and field_ID in pzfields: 7249 field_description = ( 7250 PZfields_INFOS[field]["Description"] 7251 + f", profile {default_profile}" 7252 ) 7253 self.get_header().infos[field_ID] = vcf.parser._Info( 7254 field_ID, 7255 PZfields_INFOS[field]["Number"], 7256 PZfields_INFOS[field]["Type"], 7257 field_description, 7258 "unknown", 7259 "unknown", 7260 code_type_map[PZfields_INFOS[field]["Type"]], 7261 ) 7262 7263 # Create INFO fields if not exist for each profile 7264 for profile in prioritizations_config: 7265 if profile in profiles or profiles == []: 7266 for field in PZfields_INFOS: 7267 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7268 field_description = ( 7269 PZfields_INFOS[field]["Description"] 7270 + f", profile {profile}" 7271 ) 7272 if ( 7273 field_ID not in self.get_header().infos 7274 and field in pzfields 7275 ): 7276 
self.get_header().infos[field_ID] = vcf.parser._Info( 7277 field_ID, 7278 PZfields_INFOS[field]["Number"], 7279 PZfields_INFOS[field]["Type"], 7280 field_description, 7281 "unknown", 7282 "unknown", 7283 code_type_map[PZfields_INFOS[field]["Type"]], 7284 ) 7285 7286 # Header 7287 for pzfield in list_of_pzfields: 7288 if re.match(f"{pz_prefix}Score.*", pzfield): 7289 added_column = self.add_column( 7290 table_name=table_variants, 7291 column_name=pzfield, 7292 column_type="INTEGER", 7293 default_value="0", 7294 ) 7295 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7296 added_column = self.add_column( 7297 table_name=table_variants, 7298 column_name=pzfield, 7299 column_type="BOOLEAN", 7300 default_value="1", 7301 ) 7302 elif re.match(f"{pz_prefix}Class.*", pzfield): 7303 added_column = self.add_column( 7304 table_name=table_variants, 7305 column_name=pzfield, 7306 column_type="VARCHAR[]", 7307 default_value="null", 7308 ) 7309 else: 7310 added_column = self.add_column( 7311 table_name=table_variants, 7312 column_name=pzfield, 7313 column_type="STRING", 7314 default_value="''", 7315 ) 7316 added_columns.append(added_column) 7317 7318 # Profiles 7319 if profiles: 7320 7321 # foreach profile in configuration file 7322 for profile in prioritizations_config: 7323 7324 # If profile is asked in param, or ALL are asked (empty profile []) 7325 if profile in profiles or profiles == []: 7326 log.info(f"Profile '{profile}'") 7327 7328 sql_set_info_option = "" 7329 7330 sql_set_info = [] 7331 7332 # PZ fields set 7333 7334 # PZScore 7335 if ( 7336 f"{pz_prefix}Score{pzfields_sep}{profile}" 7337 in list_of_pzfields 7338 ): 7339 sql_set_info.append( 7340 f""" 7341 concat( 7342 '{pz_prefix}Score{pzfields_sep}{profile}=', 7343 {pz_prefix}Score{pzfields_sep}{profile} 7344 ) 7345 """ 7346 ) 7347 if ( 7348 profile == default_profile 7349 and f"{pz_prefix}Score" in list_of_pzfields 7350 ): 7351 sql_set_info.append( 7352 f""" 7353 concat( 7354 '{pz_prefix}Score=', 7355 {pz_prefix}Score{pzfields_sep}{profile} 7356 ) 7357 """ 7358 ) 7359 7360 # PZFlag 7361 if ( 7362 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7363 in list_of_pzfields 7364 ): 7365 sql_set_info.append( 7366 f""" 7367 concat( 7368 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7369 CASE 7370 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7371 THEN 'PASS' 7372 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7373 THEN 'FILTERED' 7374 END 7375 ) 7376 """ 7377 ) 7378 if ( 7379 profile == default_profile 7380 and f"{pz_prefix}Flag" in list_of_pzfields 7381 ): 7382 sql_set_info.append( 7383 f""" 7384 concat( 7385 '{pz_prefix}Flag=', 7386 CASE 7387 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7388 THEN 'PASS' 7389 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7390 THEN 'FILTERED' 7391 END 7392 ) 7393 """ 7394 ) 7395 7396 # PZClass 7397 if ( 7398 f"{pz_prefix}Class{pzfields_sep}{profile}" 7399 in list_of_pzfields 7400 ): 7401 sql_set_info.append( 7402 f""" 7403 concat( 7404 '{pz_prefix}Class{pzfields_sep}{profile}=', 7405 CASE 7406 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7407 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7408 ELSE '.' 
7409 END 7410 ) 7411 7412 """ 7413 ) 7414 if ( 7415 profile == default_profile 7416 and f"{pz_prefix}Class" in list_of_pzfields 7417 ): 7418 sql_set_info.append( 7419 f""" 7420 concat( 7421 '{pz_prefix}Class=', 7422 CASE 7423 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7424 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7425 ELSE '.' 7426 END 7427 ) 7428 """ 7429 ) 7430 7431 # PZComment 7432 if ( 7433 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7434 in list_of_pzfields 7435 ): 7436 sql_set_info.append( 7437 f""" 7438 CASE 7439 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7440 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7441 ELSE '' 7442 END 7443 """ 7444 ) 7445 if ( 7446 profile == default_profile 7447 and f"{pz_prefix}Comment" in list_of_pzfields 7448 ): 7449 sql_set_info.append( 7450 f""" 7451 CASE 7452 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7453 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7454 ELSE '' 7455 END 7456 """ 7457 ) 7458 7459 # PZInfos 7460 if ( 7461 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7462 in list_of_pzfields 7463 ): 7464 sql_set_info.append( 7465 f""" 7466 CASE 7467 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7468 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7469 ELSE '' 7470 END 7471 """ 7472 ) 7473 if ( 7474 profile == default_profile 7475 and f"{pz_prefix}Infos" in list_of_pzfields 7476 ): 7477 sql_set_info.append( 7478 f""" 7479 CASE 7480 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7481 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7482 ELSE '' 7483 END 7484 """ 7485 ) 7486 7487 # Merge PZfields 7488 sql_set_info_option = "" 7489 sql_set_sep = "" 7490 for sql_set in sql_set_info: 7491 if sql_set_sep: 7492 sql_set_info_option += f""" 7493 , concat('{sql_set_sep}', {sql_set}) 7494 """ 7495 else: 7496 sql_set_info_option += f""" 7497 , {sql_set} 7498 """ 7499 sql_set_sep = ";" 7500 7501 sql_queries = [] 7502 for annotation in prioritizations_config[profile]: 7503 7504 # skip special sections 7505 if annotation.startswith("_"): 7506 continue 7507 7508 # For each criterions 7509 for criterion in prioritizations_config[profile][ 7510 annotation 7511 ]: 7512 7513 # Criterion mode 7514 criterion_mode = None 7515 if np.any( 7516 np.isin(list(criterion.keys()), ["type", "value"]) 7517 ): 7518 criterion_mode = "operation" 7519 elif np.any( 7520 np.isin(list(criterion.keys()), ["sql", "fields"]) 7521 ): 7522 criterion_mode = "sql" 7523 log.debug(f"Criterion Mode: {criterion_mode}") 7524 7525 # Criterion parameters 7526 criterion_type = criterion.get("type", None) 7527 criterion_value = criterion.get("value", None) 7528 criterion_sql = criterion.get("sql", None) 7529 criterion_fields = criterion.get("fields", None) 7530 criterion_score = criterion.get("score", 0) 7531 criterion_flag = criterion.get("flag", "PASS") 7532 criterion_class = criterion.get("class", None) 7533 criterion_flag_bool = criterion_flag == "PASS" 7534 criterion_comment = ( 7535 ", ".join(criterion.get("comment", [])) 7536 .replace("'", "''") 7537 .replace(";", ",") 7538 .replace("\t", " ") 7539 ) 7540 criterion_infos = ( 7541 str(criterion) 7542 .replace("'", "''") 7543 .replace(";", ",") 7544 .replace("\t", " ") 7545 ) 7546 7547 # SQL 7548 if criterion_sql is not None and isinstance( 7549 criterion_sql, list 7550 ): 7551 
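                                # A 'sql' criterion may be written as a list of fragments
                                # in the profile configuration; they are simply joined with
                                # spaces into one WHERE clause below. Illustrative only
                                # (hypothetical exploded column names):
                                #   ['"INFO/DP" >= 30', 'AND "INFO/gnomAD_AF" <= 0.01']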
criterion_sql = " ".join(criterion_sql) 7552 7553 # Fields and explode 7554 if criterion_fields is None: 7555 criterion_fields = [annotation] 7556 if not isinstance(criterion_fields, list): 7557 criterion_fields = str(criterion_fields).split(",") 7558 7559 # Class 7560 if criterion_class is not None and not isinstance( 7561 criterion_class, list 7562 ): 7563 criterion_class = str(criterion_class).split(",") 7564 7565 for annotation_field in criterion_fields: 7566 7567 # Explode specific annotation 7568 log.debug( 7569 f"Explode annotation '{annotation_field}'" 7570 ) 7571 added_columns += self.explode_infos( 7572 prefix=explode_infos_prefix, 7573 fields=[annotation_field], 7574 table=table_variants, 7575 ) 7576 extra_infos = self.get_extra_infos( 7577 table=table_variants 7578 ) 7579 7580 # Check if annotation field is present 7581 if ( 7582 f"{explode_infos_prefix}{annotation_field}" 7583 not in extra_infos 7584 ): 7585 msq_err = f"Annotation '{annotation_field}' not in data" 7586 log.error(msq_err) 7587 raise ValueError(msq_err) 7588 else: 7589 log.debug( 7590 f"Annotation '{annotation_field}' in data" 7591 ) 7592 7593 sql_set = [] 7594 sql_set_info = [] 7595 7596 # PZ fields set 7597 7598 # PZScore 7599 if ( 7600 f"{pz_prefix}Score{pzfields_sep}{profile}" 7601 in list_of_pzfields 7602 ): 7603 # VaRank prioritization score mode 7604 if prioritization_score_mode.upper().strip() in [ 7605 "VARANK", 7606 "MAX", 7607 "MAXIMUM", 7608 "TOP", 7609 ]: 7610 sql_set.append( 7611 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7612 ) 7613 # default HOWARD prioritization score mode 7614 else: 7615 sql_set.append( 7616 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7617 ) 7618 7619 # PZFlag 7620 if ( 7621 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7622 in list_of_pzfields 7623 ): 7624 sql_set.append( 7625 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7626 ) 7627 7628 # PZClass 7629 if ( 7630 f"{pz_prefix}Class{pzfields_sep}{profile}" 7631 in list_of_pzfields 7632 and criterion_class is not None 7633 ): 7634 sql_set.append( 7635 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7636 ) 7637 7638 # PZComment 7639 if ( 7640 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7641 in list_of_pzfields 7642 ): 7643 sql_set.append( 7644 f""" 7645 {pz_prefix}Comment{pzfields_sep}{profile} = 7646 concat( 7647 {pz_prefix}Comment{pzfields_sep}{profile}, 7648 CASE 7649 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7650 THEN ', ' 7651 ELSE '' 7652 END, 7653 '{criterion_comment}' 7654 ) 7655 """ 7656 ) 7657 7658 # PZInfos 7659 if ( 7660 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7661 in list_of_pzfields 7662 ): 7663 sql_set.append( 7664 f""" 7665 {pz_prefix}Infos{pzfields_sep}{profile} = 7666 concat( 7667 {pz_prefix}Infos{pzfields_sep}{profile}, 7668 '{criterion_infos}' 7669 ) 7670 """ 7671 ) 7672 sql_set_option = ",".join(sql_set) 7673 7674 # Criterion and comparison 7675 if sql_set_option: 7676 7677 if criterion_mode in ["operation"]: 7678 7679 try: 7680 float(criterion_value) 7681 sql_update = f""" 7682 UPDATE {table_variants} 7683 SET {sql_set_option} 7684 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7685 AND 
CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7686 """ 7687 except: 7688 contains_option = "" 7689 if criterion_type == "contains": 7690 contains_option = ".*" 7691 sql_update = f""" 7692 UPDATE {table_variants} 7693 SET {sql_set_option} 7694 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7695 """ 7696 sql_queries.append(sql_update) 7697 7698 elif criterion_mode in ["sql"]: 7699 7700 sql_update = f""" 7701 UPDATE {table_variants} 7702 SET {sql_set_option} 7703 WHERE {criterion_sql} 7704 """ 7705 sql_queries.append(sql_update) 7706 7707 else: 7708 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7709 log.error(msg_err) 7710 raise ValueError(msg_err) 7711 7712 else: 7713 log.warning( 7714 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7715 ) 7716 7717 # PZTags 7718 if ( 7719 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7720 in list_of_pzfields 7721 ): 7722 7723 # Create PZFalgs value 7724 pztags_value = "" 7725 pztags_sep_default = "," 7726 pztags_sep = "" 7727 for pzfield in pzfields: 7728 if pzfield not in [f"{pz_prefix}Tags"]: 7729 if ( 7730 f"{pzfield}{pzfields_sep}{profile}" 7731 in list_of_pzfields 7732 ): 7733 if pzfield in [f"{pz_prefix}Flag"]: 7734 pztags_value += f"""{pztags_sep}{pzfield}#', 7735 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7736 THEN 'PASS' 7737 ELSE 'FILTERED' 7738 END, '""" 7739 elif pzfield in [f"{pz_prefix}Class"]: 7740 pztags_value += f"""{pztags_sep}{pzfield}#', 7741 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7742 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7743 ELSE '.' 7744 END, '""" 7745 else: 7746 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7747 pztags_sep = pztags_sep_default 7748 7749 # Add Query update for PZFlags 7750 sql_update_pztags = f""" 7751 UPDATE {table_variants} 7752 SET INFO = concat( 7753 INFO, 7754 CASE WHEN INFO NOT in ('','.') 7755 THEN ';' 7756 ELSE '' 7757 END, 7758 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7759 ) 7760 """ 7761 sql_queries.append(sql_update_pztags) 7762 7763 # Add Query update for PZFlags for default 7764 if profile == default_profile: 7765 sql_update_pztags_default = f""" 7766 UPDATE {table_variants} 7767 SET INFO = concat( 7768 INFO, 7769 ';', 7770 '{pz_prefix}Tags={pztags_value}' 7771 ) 7772 """ 7773 sql_queries.append(sql_update_pztags_default) 7774 7775 log.info(f"""Profile '{profile}' - Prioritization... """) 7776 7777 if sql_queries: 7778 7779 for sql_query in sql_queries: 7780 log.debug( 7781 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7782 ) 7783 self.conn.execute(sql_query) 7784 7785 log.info(f"""Profile '{profile}' - Update... 
""") 7786 sql_query_update = f""" 7787 UPDATE {table_variants} 7788 SET INFO = 7789 concat( 7790 CASE 7791 WHEN INFO NOT IN ('','.') 7792 THEN concat(INFO, ';') 7793 ELSE '' 7794 END 7795 {sql_set_info_option} 7796 ) 7797 """ 7798 self.conn.execute(sql_query_update) 7799 7800 else: 7801 7802 log.warning(f"No profiles in parameters") 7803 7804 # Remove added columns 7805 for added_column in added_columns: 7806 self.drop_column(column=added_column) 7807 7808 # Explode INFOS fields into table fields 7809 if self.get_explode_infos(): 7810 self.explode_infos( 7811 prefix=self.get_explode_infos_prefix(), 7812 fields=self.get_explode_infos_fields(), 7813 force=True, 7814 ) 7815 7816 return True 7817 7818 ### 7819 # HGVS 7820 ### 7821 7822 def annotation_hgvs(self, threads: int = None) -> None: 7823 """ 7824 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7825 coordinates and alleles. 7826 7827 :param threads: The `threads` parameter is an optional integer that specifies the number of 7828 threads to use for parallel processing. If no value is provided, it will default to the number 7829 of threads obtained from the `get_threads()` method 7830 :type threads: int 7831 """ 7832 7833 # Function for each partition of the Dask Dataframe 7834 def partition_function(partition): 7835 """ 7836 The function `partition_function` applies the `annotation_hgvs_partition` function to 7837 each row of a DataFrame called `partition`. 7838 7839 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7840 to be processed 7841 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7842 the "partition" dataframe along the axis 1. 7843 """ 7844 return partition.apply(annotation_hgvs_partition, axis=1) 7845 7846 def annotation_hgvs_partition(row) -> str: 7847 """ 7848 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7849 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7850 7851 :param row: A dictionary-like object that contains the values for the following keys: 7852 :return: a string that contains the HGVS names associated with the given row of data. 
7853 """ 7854 7855 chr = row["CHROM"] 7856 pos = row["POS"] 7857 ref = row["REF"] 7858 alt = row["ALT"] 7859 7860 # Find list of associated transcripts 7861 transcripts_list = list( 7862 polars_conn.execute( 7863 f""" 7864 SELECT transcript 7865 FROM refseq_df 7866 WHERE CHROM='{chr}' 7867 AND POS={pos} 7868 """ 7869 )["transcript"] 7870 ) 7871 7872 # Full HGVS annotation in list 7873 hgvs_full_list = [] 7874 7875 for transcript_name in transcripts_list: 7876 7877 # Transcript 7878 transcript = get_transcript( 7879 transcripts=transcripts, transcript_name=transcript_name 7880 ) 7881 # Exon 7882 if use_exon: 7883 exon = transcript.find_exon_number(pos) 7884 else: 7885 exon = None 7886 # Protein 7887 transcript_protein = None 7888 if use_protein or add_protein or full_format: 7889 transcripts_protein = list( 7890 polars_conn.execute( 7891 f""" 7892 SELECT protein 7893 FROM refseqlink_df 7894 WHERE transcript='{transcript_name}' 7895 LIMIT 1 7896 """ 7897 )["protein"] 7898 ) 7899 if len(transcripts_protein): 7900 transcript_protein = transcripts_protein[0] 7901 7902 # HGVS name 7903 hgvs_name = format_hgvs_name( 7904 chr, 7905 pos, 7906 ref, 7907 alt, 7908 genome=genome, 7909 transcript=transcript, 7910 transcript_protein=transcript_protein, 7911 exon=exon, 7912 use_gene=use_gene, 7913 use_protein=use_protein, 7914 full_format=full_format, 7915 use_version=use_version, 7916 codon_type=codon_type, 7917 ) 7918 hgvs_full_list.append(hgvs_name) 7919 if add_protein and not use_protein and not full_format: 7920 hgvs_name = format_hgvs_name( 7921 chr, 7922 pos, 7923 ref, 7924 alt, 7925 genome=genome, 7926 transcript=transcript, 7927 transcript_protein=transcript_protein, 7928 exon=exon, 7929 use_gene=use_gene, 7930 use_protein=True, 7931 full_format=False, 7932 use_version=use_version, 7933 codon_type=codon_type, 7934 ) 7935 hgvs_full_list.append(hgvs_name) 7936 7937 # Create liste of HGVS annotations 7938 hgvs_full = ",".join(hgvs_full_list) 7939 7940 return hgvs_full 7941 7942 # Polars connexion 7943 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7944 7945 # Config 7946 config = self.get_config() 7947 7948 # Databases 7949 # Genome 7950 databases_genomes_folders = ( 7951 config.get("folders", {}) 7952 .get("databases", {}) 7953 .get("genomes", DEFAULT_GENOME_FOLDER) 7954 ) 7955 databases_genome = ( 7956 config.get("folders", {}).get("databases", {}).get("genomes", "") 7957 ) 7958 # refseq database folder 7959 databases_refseq_folders = ( 7960 config.get("folders", {}) 7961 .get("databases", {}) 7962 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7963 ) 7964 # refseq 7965 databases_refseq = config.get("databases", {}).get("refSeq", None) 7966 # refSeqLink 7967 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7968 7969 # Param 7970 param = self.get_param() 7971 7972 # Quick HGVS 7973 if "hgvs_options" in param and param.get("hgvs_options", ""): 7974 log.info(f"Quick HGVS Annotation:") 7975 if not param.get("hgvs", None): 7976 param["hgvs"] = {} 7977 for option in param.get("hgvs_options", "").split(","): 7978 option_var_val = option.split("=") 7979 option_var = option_var_val[0] 7980 if len(option_var_val) > 1: 7981 option_val = option_var_val[1] 7982 else: 7983 option_val = "True" 7984 if option_val.upper() in ["TRUE"]: 7985 option_val = True 7986 elif option_val.upper() in ["FALSE"]: 7987 option_val = False 7988 log.info(f" {option_var}={option_val}") 7989 param["hgvs"][option_var] = option_val 7990 7991 # Check if HGVS annotation enabled 7992 if "hgvs" in param: 
        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info("HGVS Annotation...")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq and refSeqLink
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug("refSeq loading...")
        # refSeq in DuckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Load all refSeq transcripts overlapping the variants into a dataframe
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
            """
        refseq_df = self.conn.query(refseq_query).pl()
        if refseqlink_file:
            log.debug("refSeqLink loading...")
            # refSeqLink in DuckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Load all refSeqLink entries into a dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
                """
            # Polars dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read refSeq transcripts into a python dict/model.
        log.debug("Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
                """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask dataframe from the Pandas dataframe, with as many partitions as threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.map_partitions() to apply the function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask dataframe to Pandas dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": "HGVS annotation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
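    # Example (hypothetical): an "hgvs" param and the kind of INFO/hgvs value produced.
    # The transcript and protein accessions below are illustrative only, not taken
    # from a real run.
    #
    #   param = {"hgvs": {"use_gene": True, "add_protein": True}}
    #   # A variant covered by one refSeq transcript could then get, e.g.:
    #   # INFO ... ;hgvs=NM_000000.0(GENE):c.100A>G,NP_000000.0:p.Lys34Glu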
name="calculations", 8219 config_dict=operations_config_dict, 8220 config_file=operations_config_file, 8221 ) 8222 for op in operations: 8223 op_name = operations[op].get("name", op).upper() 8224 op_description = operations[op].get("description", op_name) 8225 op_available = operations[op].get("available", False) 8226 if op_available: 8227 operations_help.append(f" {op_name}: {op_description}") 8228 8229 # Sort operations 8230 operations_help.sort() 8231 8232 # insert header 8233 operations_help.insert(0, "Available calculation operations:") 8234 8235 # Return 8236 return operations_help 8237 8238 def calculation( 8239 self, 8240 operations: dict = {}, 8241 operations_config_dict: dict = {}, 8242 operations_config_file: str = None, 8243 ) -> None: 8244 """ 8245 It takes a list of operations, and for each operation, it checks if it's a python or sql 8246 operation, and then calls the appropriate function 8247 8248 param json example: 8249 "calculation": { 8250 "NOMEN": { 8251 "options": { 8252 "hgvs_field": "hgvs" 8253 }, 8254 "middle" : null 8255 } 8256 """ 8257 8258 # Param 8259 param = self.get_param() 8260 8261 # CHeck operations config file 8262 if operations_config_file is None: 8263 operations_config_file = param.get("calculation", {}).get( 8264 "calculation_config", None 8265 ) 8266 8267 # operations config 8268 operations_config = self.get_config_json( 8269 name="calculations", 8270 config_dict=operations_config_dict, 8271 config_file=operations_config_file, 8272 ) 8273 8274 # Upper keys 8275 operations_config = {k.upper(): v for k, v in operations_config.items()} 8276 8277 # Calculations 8278 8279 # Operations from param 8280 operations = param.get("calculation", {}).get("calculations", operations) 8281 8282 # Quick calculation - add 8283 if param.get("calculations", None): 8284 8285 # List of operations 8286 calculations_list = [ 8287 value.strip() for value in param.get("calculations", "").split(",") 8288 ] 8289 8290 # Log 8291 log.info(f"Quick Calculations:") 8292 for calculation_key in calculations_list: 8293 log.info(f" {calculation_key}") 8294 8295 # Create tmp operations (to keep operation order) 8296 operations_tmp = {} 8297 for calculation_operation in calculations_list: 8298 if calculation_operation.upper() not in operations_tmp: 8299 log.debug( 8300 f"{calculation_operation}.upper() not in {operations_tmp}" 8301 ) 8302 operations_tmp[calculation_operation.upper()] = {} 8303 add_value_into_dict( 8304 dict_tree=operations_tmp, 8305 sections=[ 8306 calculation_operation.upper(), 8307 ], 8308 value=operations.get(calculation_operation.upper(), {}), 8309 ) 8310 # Add operations already in param 8311 for calculation_operation in operations: 8312 if calculation_operation not in operations_tmp: 8313 operations_tmp[calculation_operation] = operations.get( 8314 calculation_operation, {} 8315 ) 8316 8317 # Update operations in param 8318 operations = operations_tmp 8319 8320 # Operations for calculation 8321 if not operations: 8322 operations = param.get("calculation", {}).get("calculations", {}) 8323 8324 if operations: 8325 log.info(f"Calculations...") 8326 8327 # For each operations 8328 for operation_name in operations: 8329 operation_name = operation_name.upper() 8330 if operation_name not in [""]: 8331 if operation_name in operations_config: 8332 log.info(f"Calculation '{operation_name}'") 8333 operation = operations_config[operation_name] 8334 operation_type = operation.get("type", "sql") 8335 if operation_type == "python": 8336 self.calculation_process_function( 8337 
        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info("Calculations...")

            # For each operation
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
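    # Example (hypothetical): an operations-config entry that calculation_process_sql()
    # can process. Names and the query are illustrative only; the keys match what the
    # method below reads from the operation dict.
    #
    #   {
    #       "MY_SCORE": {
    #           "type": "sql",
    #           "name": "MY_SCORE",
    #           "description": "Toy score derived from an INFO field",
    #           "available": True,
    #           "output_column_name": "MY_SCORE",
    #           "output_column_type": "Float",
    #           "output_column_description": "Toy score",
    #           "info_fields": ["DP"],
    #           "info_fields_check": True,
    #           "operation_query": 'CAST("DP" AS FLOAT) / 100',
    #       }
    #   }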
    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        mathematical operation to be performed. It includes the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed. It is used for logging and error handling purposes,
        defaults to unknown
        :type operation_name: str (optional)
        """

        # Operation infos
        operation_name = operation.get("name", "unknown")
        log.debug(f"process SQL {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        operation_query = operation.get("operation_query", None)
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)
        operation_table = operation.get(
            "table", self.get_table_variants(clause="alter")
        )

        # Table variants
        if operation_table:
            table_variants = operation_table
        else:
            table_variants = self.get_table_variants(clause="alter")

        if operation_query:

            # Info fields check
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added columns
                added_columns = []

                # Create VCF header field
                vcf_reader = self.get_header()
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=False,
                    table=table_variants,
                )

                # Create column
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                        """
                    self.conn.execute(sql_update)

                    # Add to INFO
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" =
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                            """
                        self.conn.execute(sql_update_info)

                except Exception:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )

    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: The `operation` parameter is a dictionary that contains information about the
        operation to be performed. It has the following keys:
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the operation being performed. It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        """

        operation_name = operation["name"]
        log.debug(f"process Python {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        getattr(self, function_name)(*function_params)
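    # Example (hypothetical): a "python"-type operation entry. calculation_process_function()
    # resolves "function_name" as a method on self and calls it with "function_params".
    #
    #   {
    #       "VARIANT_ID": {
    #           "type": "python",
    #           "name": "VARIANT_ID",
    #           "function_name": "calculation_variant_id",
    #           "function_params": [],
    #       }
    #   }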
    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
            """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
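    # Example (hypothetical values): how the concat/CASE pattern above rewrites INFO,
    # where "variant_id" stands for whatever get_variant_id_column() returns.
    #
    #   INFO before: "DP=100"    -> after: "DP=100;variant_id=<id>"
    #   INFO before: "." or ""   -> after: "variant_id=<id>"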
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The name of the column that will store the HGVS nomenclatures extracted
        from the SnpEff annotation field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The field in the VCF file that contains SnpEff annotations, from which
        the HGVS nomenclatures are extracted, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main snpeff_hgvs column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please annotate with snpEff before using this calculation option"
            )
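        # Example (typical snpEff header, abbreviated): the regex above captures the
        # single-quoted, pipe-separated field list from the ANN Description, e.g.
        #
        #   Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | ...'
        #
        # which yields ann_header_desc entries such as {"Allele": "Allele", ...}.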
        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation field and updating variant information accordingly.

        :param uniquify: Boolean flag that determines whether the output should be uniquified, i.e.
        whether duplicate entries should be removed, defaults to True
        :type uniquify: bool (optional)
        :param output_format: The format in which the output annotations will be generated: "fields"
        to output one annotation per field, or "JSON" to output the annotations in JSON format,
        defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The prefix added to the output annotations generated during the
        calculation process, to differentiate the newly added annotations from existing ones,
        defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The field in the VCF file that contains SnpEff annotations. This field
        will be processed to explode the annotations and update the variant information accordingly,
        defaults to ANN
        :type snpeff_field: str (optional)
        """
        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # SnpEff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )
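            # Example (assumption): what the exploded value may look like, assuming
            # explode_snpeff_ann() returns ';'-separated '<prefix><field>=<value>'
            # pairs in "fields" mode and a JSON object in "JSON" mode:
            #
            #   fields: snpeff_Annotation=missense_variant;snpeff_GeneName=GENE1
            #   JSON:   snpeff_={"Annotation": "missense_variant", "GeneName": "GENE1"}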
            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please annotate with snpEff before using this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
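    # Example (hypothetical usage): explode snpEff ANN into one INFO field per ANN
    # column, assuming a Variants object whose VCF was annotated with snpEff beforehand.
    #
    #   variants.calculation_snpeff_ann_explode(
    #       uniquify=True, output_format="fields", output_prefix="snpeff_"
    #   )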
.get("NOMEN", {}) 8988 .get("options", {}) 8989 .get("transcripts_column", None) 8990 ) 8991 8992 if transcripts_table and transcripts_column: 8993 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8994 # Explode if not exists 8995 added_columns += self.explode_infos( 8996 fields=[transcripts_column], table=transcripts_table 8997 ) 8998 else: 8999 extra_field_transcript = f"NULL" 9000 9001 # Transcripts of preference source order 9002 transcripts_order = ( 9003 param.get("calculation", {}) 9004 .get("calculations", {}) 9005 .get("NOMEN", {}) 9006 .get("options", {}) 9007 .get("transcripts_order", ["column", "file"]) 9008 ) 9009 9010 # Transcripts from file 9011 transcripts = transcripts_sources.get("file", []) 9012 9013 # Explode HGVS field in column 9014 added_columns += self.explode_infos(fields=[hgvs_field]) 9015 9016 # extra infos 9017 extra_infos = self.get_extra_infos() 9018 extra_field = prefix + hgvs_field 9019 9020 if extra_field in extra_infos: 9021 9022 # Create dataframe 9023 dataframe_hgvs = self.get_query_to_df( 9024 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 9025 ) 9026 9027 # Transcripts rank 9028 transcripts_rank = { 9029 transcript: rank for rank, transcript in enumerate(transcripts, start=1) 9030 } 9031 transcripts_len = len(transcripts_rank) 9032 9033 # Create main NOMEN column 9034 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 9035 lambda x: find_nomen( 9036 hgvs=x.hgvs, 9037 transcript=x.transcript, 9038 transcripts=transcripts_rank, 9039 pattern=nomen_pattern, 9040 transcripts_source_order=transcripts_order, 9041 transcripts_len=transcripts_len, 9042 ), 9043 axis=1, 9044 ) 9045 9046 # Explode NOMEN Structure and create SQL set for update 9047 sql_nomen_fields = [] 9048 for nomen_field in nomen_dict: 9049 9050 # Create VCF header field 9051 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9052 nomen_field, 9053 ".", 9054 "String", 9055 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 9056 "howard calculation", 9057 "0", 9058 self.code_type_map.get("String"), 9059 ) 9060 9061 # Add field to SQL query update 9062 sql_nomen_fields.append( 9063 f""" 9064 CASE 9065 WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('') 9066 THEN concat( 9067 ';{nomen_field}=', 9068 dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" 9069 ) 9070 ELSE '' 9071 END 9072 """ 9073 ) 9074 9075 # SQL set for update 9076 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9077 9078 # Update 9079 sql_update = f""" 9080 UPDATE variants 9081 SET "INFO" = 9082 concat( 9083 CASE 9084 WHEN "INFO" IS NULL 9085 THEN '' 9086 ELSE "INFO" 9087 END, 9088 {sql_nomen_fields_set} 9089 ) 9090 FROM dataframe_hgvs 9091 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9092 AND variants."POS" = dataframe_hgvs."POS" 9093 AND variants."REF" = dataframe_hgvs."REF" 9094 AND variants."ALT" = dataframe_hgvs."ALT" 9095 """ 9096 self.conn.execute(sql_update) 9097 9098 # Delete dataframe 9099 del dataframe_hgvs 9100 gc.collect() 9101 9102 # Remove added columns 9103 for added_column in added_columns: 9104 self.drop_column(column=added_column) 9105 9106 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9107 """ 9108 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9109 pipeline/sample for a variant and updates the variant information in a VCF file. 
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipelines/samples for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # If FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()
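    # Example (assumption): the findbypipeline() helper is expected to return a
    # "found/total"-style count over the genotype columns, so INFO could gain e.g.
    #
    #   findbypipeline=2/3
    #
    # for a variant called in 2 of 3 pipelines/samples. The exact format depends on
    # the findbypipeline() helper imported from the functions module.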
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # If FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(
                    genotypeconcordance_tag, "Concordance of genotype for multi caller VCF"
                ),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()
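    # Example (assumption): the genotypeconcordance() helper is expected to summarise
    # whether the genotypes of all callers/samples agree, so INFO could gain e.g.
    #
    #   genotypeconcordance=TRUE
    #
    # The exact value format depends on the genotypeconcordance() helper.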
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter is used to specify the tag name that will be used for the
        barcode calculation in the VCF file. If no tag name is provided, the default tag name is set
        to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # If FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Barcode annotation field
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, "barcode calculation (VaRank)"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
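    # Example (VaRank convention, illustrative values): the barcode is typically one
    # digit per sample, 0=absent, 1=heterozygous, 2=homozygous alternate, so for four
    # samples INFO could gain e.g.
    #
    #   barcode=0112
    #
    # The exact encoding depends on the barcode() helper.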
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates family barcode values for variants in a
        VCF file and updates the FORMAT and sample fields in the file with the calculated barcode
        values.

        :param tag: The `tag` parameter is used to specify the barcode tag that will be added to the
        VCF file during the calculation process. If no value is provided, the default value used is
        "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # If FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Barcode annotation field
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is a file")
                    with open(full_path(ped)) as ped_file:
                        ped = yaml.safe_load(ped_file)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is a str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is a json str")
                    except ValueError:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is a dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct samples list
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)
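            # Example (hypothetical): accepted "family_pedigree" values.
            #
            #   "family.json"                                      -> YAML/JSON file path
            #   '{"father": "S1", "mother": "S2", "child": "S3"}'  -> JSON string
            #   "S1,S2,S3"                                         -> comma-separated samples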
            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in ped_samples]
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update FORMAT and sample columns
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = (
                        "'"
                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
                        + "'"
                    )
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
        """

        # If FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is a file")
                    with open(full_path(trio_ped)) as trio_ped_file:
                        trio_ped = yaml.safe_load(trio_ped_file)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is a str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is a json str")
                    except ValueError:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is a list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is a dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)
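                # Example (hypothetical): accepted "trio_pedigree" values.
                #
                #   "trio.json"                                        -> YAML/JSON file path
                #   '{"father": "F1", "mother": "M1", "child": "C1"}'  -> JSON string
                #   "F1,M1,C1"                                         -> father,mother,child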
trio_ped.get("child", ""), 9686 ] 9687 9688 else: 9689 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9690 samples_list = self.get_header_sample_list() 9691 if len(samples_list) >= 3: 9692 trio_samples = self.get_header_sample_list()[0:3] 9693 trio_ped = { 9694 "father": trio_samples[0], 9695 "mother": trio_samples[1], 9696 "child": trio_samples[2], 9697 } 9698 else: 9699 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9700 log.error(msg_error) 9701 raise ValueError(msg_error) 9702 9703 # Check trio pedigree 9704 if not trio_ped or len(trio_ped) != 3: 9705 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9706 log.error(msg_error) 9707 raise ValueError(msg_error) 9708 9709 # Log 9710 log.info( 9711 f"Calculation 'TRIO' - Samples: " 9712 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9713 ) 9714 9715 # Field 9716 trio_infos = prefix + trio_tag 9717 9718 # Variants table 9719 table_variants = self.get_table_variants() 9720 9721 # Header 9722 vcf_reader = self.get_header() 9723 9724 # Create variant id 9725 variant_id_column = self.get_variant_id_column() 9726 added_columns = [variant_id_column] 9727 9728 # variant_id, FORMAT and samples 9729 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9730 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9731 ) 9732 9733 # Create dataframe 9734 dataframe_trio = self.get_query_to_df( 9735 f""" SELECT {samples_fields} FROM {table_variants} """ 9736 ) 9737 9738 # Create trio column 9739 dataframe_trio[trio_infos] = dataframe_trio.apply( 9740 lambda row: trio(row, samples=trio_samples), axis=1 9741 ) 9742 9743 # Add trio to header 9744 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9745 trio_tag, 9746 ".", 9747 "String", 9748 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9749 "howard calculation", 9750 "0", 9751 self.code_type_map.get("String"), 9752 ) 9753 9754 # Update 9755 sql_update = f""" 9756 UPDATE {table_variants} 9757 SET "INFO" = 9758 concat( 9759 CASE 9760 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9761 THEN '' 9762 ELSE concat("INFO", ';') 9763 END, 9764 CASE 9765 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9766 AND dataframe_trio."{trio_infos}" NOT NULL 9767 THEN concat( 9768 '{trio_tag}=', 9769 dataframe_trio."{trio_infos}" 9770 ) 9771 ELSE '' 9772 END 9773 ) 9774 FROM dataframe_trio 9775 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9776 """ 9777 self.conn.execute(sql_update) 9778 9779 # Remove added columns 9780 for added_column in added_columns: 9781 self.drop_column(column=added_column) 9782 9783 # Delete dataframe 9784 del dataframe_trio 9785 gc.collect() 9786 9787 def calculation_vaf_normalization(self) -> None: 9788 """ 9789 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9790 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9791 :return: The function does not return anything. 
    def calculation_vaf_normalization(self) -> None:
        """
        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
        normalization for each sample in a VCF file and updates the FORMAT and sample fields
        accordingly.
        :return: The function does not return anything.
        """

        # If FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_normalization annotation field
            vaf_normalization_tag = "VAF"

            # VCF infos tags
            vcf_infos_tags = {
                "VAF": "VAF Variant Frequency",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Do not calculate if VAF already exists
            if "VAF" in vcf_reader.formats:
                log.debug("VAF already on genotypes")
                return

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                f""" "{sample}" """ for sample in self.get_header_sample_list()
            )

            # Create dataframe (samples_fields already includes variant_id and FORMAT)
            query = f""" SELECT {samples_fields} FROM {table_variants} """
            log.debug(f"query={query}")
            dataframe_vaf_normalization = self.get_query_to_df(query=query)

            vaf_normalization_set = []

            # For each sample, vaf_normalization
            for sample in self.get_header_sample_list():
                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                    lambda row: vaf_normalization(row, sample=sample), axis=1
                )
                vaf_normalization_set.append(
                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
                )

            # Add VAF to FORMAT
            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
                "FORMAT"
            ].apply(lambda x: str(x) + ":VAF")
            vaf_normalization_set.append(
                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
            )

            # Add vaf_normalization to header
            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
                id=vaf_normalization_tag,
                num="1",
                type="Float",
                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
                type_code=self.code_type_map.get("Float"),
            )

            # Create fields to set in the update
            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_vaf_normalization_set}
                FROM dataframe_vaf_normalization
                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_normalization
            gc.collect()
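    # Example (assumption): the vaf_normalization() helper appends a VAF value to each
    # genotype while ":VAF" is appended to FORMAT, so a record could change like this:
    #
    #   FORMAT: GT:AD:DP     -> GT:AD:DP:VAF
    #   sample: 0/1:10,5:15  -> 0/1:10,5:15:0.333
    #
    # Exact values depend on the vaf_normalization() helper.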
It is used to generate various VCF info tags for the 9893 statistics, such as the number of occurrences, the list of values, the minimum value, the 9894 maximum value, the mean, the median, defaults to VAF 9895 :type info: str (optional) 9896 """ 9897 9898 # if FORMAT and samples 9899 if ( 9900 "FORMAT" in self.get_header_columns_as_list() 9901 and self.get_header_sample_list() 9902 ): 9903 9904 # vaf_stats annotation field 9905 vaf_stats_tag = info + "_stats" 9906 9907 # VCF infos tags 9908 vcf_infos_tags = { 9909 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9910 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9911 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9912 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9913 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9914 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9915 info 9916 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9917 } 9918 9919 # Prefix 9920 prefix = self.get_explode_infos_prefix() 9921 9922 # Field 9923 vaf_stats_infos = prefix + vaf_stats_tag 9924 9925 # Variants table 9926 table_variants = self.get_table_variants() 9927 9928 # Header 9929 vcf_reader = self.get_header() 9930 9931 # Create variant id 9932 variant_id_column = self.get_variant_id_column() 9933 added_columns = [variant_id_column] 9934 9935 # variant_id, FORMAT and samples 9936 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9937 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9938 ) 9939 9940 # Create dataframe 9941 dataframe_vaf_stats = self.get_query_to_df( 9942 f""" SELECT {samples_fields} FROM {table_variants} """ 9943 ) 9944 9945 # Create vaf_stats column 9946 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9947 lambda row: genotype_stats( 9948 row, samples=self.get_header_sample_list(), info=info 9949 ), 9950 axis=1, 9951 ) 9952 9953 # List of vcf tags 9954 sql_vaf_stats_fields = [] 9955 9956 # Check all VAF stats infos 9957 for stat in vcf_infos_tags: 9958 9959 # Extract stats 9960 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9961 lambda x: dict(x).get(stat, "") 9962 ) 9963 9964 # Add snpeff_hgvs to header 9965 vcf_reader.infos[stat] = vcf.parser._Info( 9966 stat, 9967 ".", 9968 "String", 9969 vcf_infos_tags.get(stat, "genotype statistics"), 9970 "howard calculation", 9971 "0", 9972 self.code_type_map.get("String"), 9973 ) 9974 9975 if len(sql_vaf_stats_fields): 9976 sep = ";" 9977 else: 9978 sep = "" 9979 9980 # Create fields to add in INFO 9981 sql_vaf_stats_fields.append( 9982 f""" 9983 CASE 9984 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9985 THEN concat( 9986 '{sep}{stat}=', 9987 dataframe_vaf_stats."{stat}" 9988 ) 9989 ELSE '' 9990 END 9991 """ 9992 ) 9993 9994 # SQL set for update 9995 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9996 9997 # Update 9998 sql_update = f""" 9999 UPDATE {table_variants} 10000 SET "INFO" = 10001 concat( 10002 CASE 10003 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10004 THEN '' 10005 ELSE concat("INFO", ';') 10006 END, 10007 {sql_vaf_stats_fields_set} 10008 ) 10009 FROM dataframe_vaf_stats 10010 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 10011 10012 """ 10013 self.conn.execute(sql_update) 10014 10015 # Remove added columns 10016 for added_column in added_columns: 10017 self.drop_column(column=added_column) 10018 10019 # Delete 
dataframe 10020 del dataframe_vaf_stats 10021 gc.collect() 10022 10023 def calculation_transcripts_annotation( 10024 self, info_json: str = None, info_format: str = None 10025 ) -> None: 10026 """ 10027 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10028 field to it if transcripts are available. 10029 10030 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10031 is a string parameter that represents the information field to be used in the transcripts JSON. 10032 It is used to specify the JSON format for the transcripts information. If no value is provided 10033 when calling the method, it defaults to " 10034 :type info_json: str 10035 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10036 method is a string parameter that specifies the format of the information field to be used in 10037 the transcripts JSON. It is used to define the format of the information field 10038 :type info_format: str 10039 """ 10040 10041 # Create transcripts table 10042 transcripts_table = self.create_transcript_view() 10043 10044 # Add info field 10045 if transcripts_table: 10046 self.transcript_view_to_variants( 10047 transcripts_table=transcripts_table, 10048 transcripts_info_field_json=info_json, 10049 transcripts_info_field_format=info_format, 10050 ) 10051 else: 10052 log.info("No Transcripts to process. Check param.json file configuration") 10053 10054 def calculation_transcripts_prioritization(self) -> None: 10055 """ 10056 The function `calculation_transcripts_prioritization` creates a transcripts table and 10057 prioritizes transcripts based on certain criteria. 10058 """ 10059 10060 # Create transcripts table 10061 transcripts_table = self.create_transcript_view() 10062 10063 # Add info field 10064 if transcripts_table: 10065 self.transcripts_prioritization(transcripts_table=transcripts_table) 10066 else: 10067 log.info("No Transcripts to process. Check param.json file configuration") 10068 10069 def calculation_transcripts_export(self) -> None: 10070 """ """ 10071 10072 # Create transcripts table 10073 transcripts_table = self.create_transcript_view() 10074 10075 # Add info field 10076 if transcripts_table: 10077 self.transcripts_export(transcripts_table=transcripts_table) 10078 else: 10079 log.info("No Transcripts to process. 
Check param.json file configuration") 10080 10081 ############### 10082 # Transcripts # 10083 ############### 10084 10085 def transcripts_export( 10086 self, transcripts_table: str = None, param: dict = {} 10087 ) -> bool: 10088 """ """ 10089 10090 log.debug("Start transcripts export...") 10091 10092 # Param 10093 if not param: 10094 param = self.get_param() 10095 10096 # Param export 10097 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10098 10099 # Output file 10100 transcripts_export_output = param_transcript_export.get("output", None) 10101 10102 if not param_transcript_export or not transcripts_export_output: 10103 log.warning(f"No transcriipts export parameters defined!") 10104 return False 10105 10106 # List of transcripts annotations 10107 query_describe = f""" 10108 SELECT column_name 10109 FROM ( 10110 DESCRIBE SELECT * FROM {transcripts_table} 10111 ) 10112 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10113 """ 10114 transcripts_annotations_list = list( 10115 self.get_query_to_df(query=query_describe)["column_name"] 10116 ) 10117 10118 # Create transcripts table for export 10119 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10120 random.choices(string.ascii_uppercase + string.digits, k=10) 10121 ) 10122 query_create_transcripts_table_export = f""" 10123 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10124 """ 10125 self.execute_query(query=query_create_transcripts_table_export) 10126 10127 # Output file format 10128 transcripts_export_output_format = get_file_format( 10129 filename=transcripts_export_output 10130 ) 10131 10132 # Format VCF - construct INFO 10133 if transcripts_export_output_format in ["vcf"]: 10134 10135 # Construct query update INFO and header 10136 query_update_info = [] 10137 for field in transcripts_annotations_list: 10138 10139 # If field not in header 10140 if field not in self.get_header_infos_list(): 10141 10142 # Add PZ Transcript in header 10143 self.get_header().infos[field] = vcf.parser._Info( 10144 field, 10145 ".", 10146 "String", 10147 f"Annotation '{field}' from transcript view", 10148 "unknown", 10149 "unknown", 10150 0, 10151 ) 10152 10153 # Add field as INFO/tag 10154 query_update_info.append( 10155 f""" 10156 CASE 10157 WHEN "{field}" IS NOT NULL 10158 THEN concat('{field}=', "{field}", ';') 10159 ELSE '' 10160 END 10161 """ 10162 ) 10163 10164 # Query param 10165 query_update_info_value = ( 10166 f""" concat('', {", ".join(query_update_info)}) """ 10167 ) 10168 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
AS 'FILTER', "INFO" """ 10169 10170 else: 10171 10172 # Query param 10173 query_update_info_value = f""" NULL """ 10174 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10175 10176 # Update query INFO column 10177 query_update = f""" 10178 UPDATE {transcripts_table_export} 10179 SET INFO = {query_update_info_value} 10180 10181 """ 10182 self.execute_query(query=query_update) 10183 10184 # Export 10185 self.export_output( 10186 output_file=transcripts_export_output, 10187 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10188 ) 10189 10190 # Drop transcripts export table 10191 query_drop_transcripts_table_export = f""" 10192 DROP TABLE {transcripts_table_export} 10193 """ 10194 self.execute_query(query=query_drop_transcripts_table_export) 10195 10196 def transcripts_prioritization( 10197 self, transcripts_table: str = None, param: dict = {} 10198 ) -> bool: 10199 """ 10200 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10201 and updates the variants table with the prioritized information. 10202 10203 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10204 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 10205 This parameter is used to identify the table where the transcripts data is stored for the 10206 prioritization process 10207 :type transcripts_table: str 10208 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10209 that contains various configuration settings for the prioritization process of transcripts. It 10210 is used to customize the behavior of the prioritization algorithm and includes settings such as 10211 the prefix for prioritization fields, default profiles, and other 10212 :type param: dict 10213 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10214 transcripts prioritization process is successfully completed, and `False` if there are any 10215 issues or if no profile is defined for transcripts prioritization. 
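
        A usage sketch, given a loaded `Variants` instance `variants` (parameter values
        are illustrative, built from the keys read by this method; profile and field
        names are assumptions):

        >>> param = {
        ...     "transcripts": {
        ...         "prioritization": {
        ...             "profiles": ["default"],
        ...             "pzprefix": "PTZ",
        ...             "pzfields": ["Score", "Flag"],
        ...             "prioritization_transcripts_order": {
        ...                 "PTZFlag": "DESC",
        ...                 "PTZScore": "DESC",
        ...             },
        ...         },
        ...     },
        ... }
        >>> variants.transcripts_prioritization(param=param)
        True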
10216 """ 10217 10218 log.debug("Start transcripts prioritization...") 10219 10220 # Param 10221 if not param: 10222 param = self.get_param() 10223 10224 # Variants table 10225 table_variants = self.get_table_variants() 10226 10227 # Transcripts table 10228 if transcripts_table is None: 10229 transcripts_table = self.create_transcript_view( 10230 transcripts_table="transcripts", param=param 10231 ) 10232 if transcripts_table is None: 10233 msg_err = "No Transcripts table availalble" 10234 log.error(msg_err) 10235 raise ValueError(msg_err) 10236 log.debug(f"transcripts_table={transcripts_table}") 10237 10238 # Get transcripts columns 10239 columns_as_list_query = f""" 10240 DESCRIBE {transcripts_table} 10241 """ 10242 columns_as_list = list( 10243 self.get_query_to_df(columns_as_list_query)["column_name"] 10244 ) 10245 10246 # Create INFO if not exists 10247 if "INFO" not in columns_as_list: 10248 query_add_info = f""" 10249 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10250 """ 10251 self.execute_query(query_add_info) 10252 10253 # Prioritization param and Force only PZ Score and Flag 10254 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10255 10256 # PZ profile by default 10257 pz_profile_default = ( 10258 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10259 ) 10260 10261 # Exit if no profile 10262 if pz_profile_default is None: 10263 log.warning("No profile defined for transcripts prioritization") 10264 return False 10265 10266 # PZ fields 10267 pz_param_pzfields = {} 10268 10269 # PZ field transcripts 10270 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10271 10272 # Add PZ Transcript in header 10273 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10274 pz_fields_transcripts, 10275 ".", 10276 "String", 10277 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10278 "unknown", 10279 "unknown", 10280 code_type_map["String"], 10281 ) 10282 10283 # Mandatory fields 10284 pz_mandatory_fields_list = [ 10285 "Score", 10286 "Flag", 10287 "Tags", 10288 "Comment", 10289 "Infos", 10290 "Class", 10291 ] 10292 pz_mandatory_fields = [] 10293 for pz_mandatory_field in pz_mandatory_fields_list: 10294 pz_mandatory_fields.append( 10295 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10296 ) 10297 10298 # PZ fields in param 10299 for pz_field in pz_param.get("pzfields", []): 10300 if pz_field in pz_mandatory_fields_list: 10301 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10302 pz_param.get("pzprefix", "PTZ") + pz_field 10303 ) 10304 else: 10305 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10306 pz_param_pzfields[pz_field] = pz_field_new 10307 10308 # Add PZ Transcript in header 10309 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10310 pz_field_new, 10311 ".", 10312 "String", 10313 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10314 "unknown", 10315 "unknown", 10316 code_type_map["String"], 10317 ) 10318 10319 # PZ fields param 10320 pz_param["pzfields"] = pz_mandatory_fields 10321 10322 # Prioritization 10323 prioritization_result = self.prioritization( 10324 table=transcripts_table, 10325 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10326 ) 10327 if not prioritization_result: 10328 log.warning("Transcripts prioritization not processed") 10329 return False 10330 10331 # PZ fields sql query 10332 query_update_select_list = [] 10333 
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove the 'transcript' column, which is handled separately
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by, depending on transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript join depends on whether the transcript version is considered
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                    ON {transcripts_version_join}
            """

        else:

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND {table_variants}."#CHROM" = RankedTranscripts."#CHROM"
                AND {table_variants}."POS" = RankedTranscripts."POS"
                AND {table_variants}."REF" = RankedTranscripts."REF"
                AND {table_variants}."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list, list]:
        """
        The `create_transcript_view_from_columns_map` function generates temporary table views
        based on a columns mapping for transcripts data.

        :param transcripts_table: The name of the table where the transcripts data is stored or
        will be stored in the database. This table typically contains information about
        transcripts such as Ensembl transcript IDs, gene names, scores and predictions, defaults
        to transcripts
        :type transcripts_table: str (optional)
        :param columns_maps: A list of mapping configurations; each entry describes the main
        transcript column and its additional information columns (see the commented example
        below)
        :type columns_maps: dict
        :param added_columns: A list that collects the additional columns added to the variants
        table by exploding the transcript information columns along with the main transcript
        column
        :type added_columns: list
        :param temporary_tables: A list that collects the names of the temporary tables created
        during the process; these tables store intermediate results before the final view is
        generated
        :type temporary_tables: list
        :param annotation_fields: A list that collects the fields used for annotation in the
        view creation; these fields are extracted from the `transcripts_column` and
        `transcripts_infos_columns` of each mapping
        :type annotation_fields: list
        :param column_rename: A dictionary mapping original column names to the desired renamed
        column names, applied while creating the temporary table views
        :type column_rename: dict
        :param column_clean: A boolean flag that determines whether column names are cleaned by
        removing non-alphanumeric characters, defaults to False
        :type column_clean: bool (optional)
        :param column_case: The case transformation applied to column names during the view
        creation: "lower", "upper", or None to leave them unchanged
        :type column_case: str
        :return: A tuple containing three lists: `added_columns`, `temporary_tables`, and
        `annotation_fields`.
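
        A usage sketch, given a loaded `Variants` instance `variants` (column names
        follow the dbNSFP-style example commented in the body below; the call itself is
        illustrative):

        >>> columns_maps = [
        ...     {
        ...         "transcripts_column": "Ensembl_transcriptid",
        ...         "transcripts_infos_columns": ["genename", "Ensembl_geneid"],
        ...     }
        ... ]
        >>> added_columns, temporary_tables, annotation_fields = (
        ...     variants.create_transcript_view_from_columns_map(
        ...         transcripts_table="transcripts", columns_maps=columns_maps
        ...     )
        ... )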
10568 """ 10569 10570 log.debug("Start transcrpts view creation from columns map...") 10571 10572 # "from_columns_map": [ 10573 # { 10574 # "transcripts_column": "Ensembl_transcriptid", 10575 # "transcripts_infos_columns": [ 10576 # "genename", 10577 # "Ensembl_geneid", 10578 # "LIST_S2_score", 10579 # "LIST_S2_pred", 10580 # ], 10581 # }, 10582 # { 10583 # "transcripts_column": "Ensembl_transcriptid", 10584 # "transcripts_infos_columns": [ 10585 # "genename", 10586 # "VARITY_R_score", 10587 # "Aloft_pred", 10588 # ], 10589 # }, 10590 # ], 10591 10592 # Init 10593 if temporary_tables is None: 10594 temporary_tables = [] 10595 if annotation_fields is None: 10596 annotation_fields = [] 10597 10598 # Variants table 10599 table_variants = self.get_table_variants() 10600 10601 for columns_map in columns_maps: 10602 10603 # Log 10604 log.debug(f"columns_map={columns_map}") 10605 10606 # Transcript column 10607 transcripts_column = columns_map.get("transcripts_column", None) 10608 10609 # Transcripts infos columns 10610 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10611 10612 # Transcripts infos columns rename 10613 column_rename = columns_map.get("column_rename", column_rename) 10614 10615 # Transcripts infos columns clean 10616 column_clean = columns_map.get("column_clean", column_clean) 10617 10618 # Transcripts infos columns case 10619 column_case = columns_map.get("column_case", column_case) 10620 10621 if transcripts_column is not None: 10622 10623 # Explode 10624 added_columns += self.explode_infos( 10625 fields=[transcripts_column] + transcripts_infos_columns 10626 ) 10627 10628 # View clauses 10629 clause_select_variants = [] 10630 clause_select_tanscripts = [] 10631 for field in [transcripts_column] + transcripts_infos_columns: 10632 10633 # AS field 10634 as_field = field 10635 10636 # Rename 10637 if column_rename: 10638 as_field = column_rename.get(as_field, as_field) 10639 10640 # Clean 10641 if column_clean: 10642 as_field = clean_annotation_field(as_field) 10643 10644 # Case 10645 if column_case: 10646 if column_case.lower() in ["lower"]: 10647 as_field = as_field.lower() 10648 elif column_case.lower() in ["upper"]: 10649 as_field = as_field.upper() 10650 10651 # Clause select Variants 10652 clause_select_variants.append( 10653 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10654 ) 10655 10656 if field in [transcripts_column]: 10657 clause_select_tanscripts.append( 10658 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10659 ) 10660 else: 10661 clause_select_tanscripts.append( 10662 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10663 ) 10664 annotation_fields.append(as_field) 10665 10666 # Query View 10667 query = f""" 10668 SELECT 10669 "#CHROM", POS, REF, ALT, INFO, 10670 "{transcripts_column}" AS 'transcript', 10671 {", ".join(clause_select_tanscripts)} 10672 FROM ( 10673 SELECT 10674 "#CHROM", POS, REF, ALT, INFO, 10675 {", ".join(clause_select_variants)} 10676 FROM {table_variants} 10677 ) 10678 WHERE "{transcripts_column}" IS NOT NULL 10679 """ 10680 10681 # Create temporary table 10682 temporary_table = transcripts_table + "".join( 10683 random.choices(string.ascii_uppercase + string.digits, k=10) 10684 ) 10685 10686 # # Temporary_tables 10687 # temporary_tables.append(temporary_table) 10688 # query_view = f""" 10689 # CREATE TEMPORARY TABLE {temporary_table} 10690 # AS ({query}) 10691 # """ 10692 # self.execute_query(query=query_view) 10693 10694 # Temporary_tables 10695 
temporary_tables.append(temporary_table) 10696 10697 # List of unique #CHROM 10698 query_unique_chrom = f""" 10699 SELECT DISTINCT "#CHROM" 10700 FROM variants 10701 """ 10702 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 10703 10704 # Create table with structure but without data 10705 query_create_table = f""" 10706 CREATE TABLE {temporary_table} 10707 AS ({query} LIMIT 0) 10708 """ 10709 self.execute_query(query=query_create_table) 10710 10711 # Process by #CHROM 10712 for chrom in unique_chroms["#CHROM"]: 10713 10714 # Log 10715 log.debug(f"Processing #CHROM={chrom}") 10716 10717 # Select data by #CHROM 10718 query_chunk = f""" 10719 SELECT * 10720 FROM ({query}) 10721 WHERE "#CHROM" = '{chrom}' 10722 """ 10723 10724 # Insert data 10725 query_insert_chunk = f""" 10726 INSERT INTO {temporary_table} 10727 {query_chunk} 10728 """ 10729 self.execute_query(query=query_insert_chunk) 10730 10731 return added_columns, temporary_tables, annotation_fields 10732 10733 def create_transcript_view_from_column_format( 10734 self, 10735 transcripts_table: str = "transcripts", 10736 column_formats: dict = {}, 10737 temporary_tables: list = None, 10738 annotation_fields: list = None, 10739 column_rename: dict = {}, 10740 column_clean: bool = False, 10741 column_case: str = None, 10742 ) -> tuple[list, list, list]: 10743 """ 10744 The `create_transcript_view_from_column_format` function generates a transcript view based on 10745 specified column formats, adds additional columns and annotation fields, and returns the list of 10746 temporary tables and annotation fields. 10747 10748 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10749 of the table containing the transcripts data. This table will be used as the base table for 10750 creating the transcript view. The default value for this parameter is "transcripts", but you can 10751 provide a different table name if needed, defaults to transcripts 10752 :type transcripts_table: str (optional) 10753 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10754 about the columns to be used for creating the transcript view. Each entry in the dictionary 10755 specifies the mapping between a transcripts column and a transcripts infos column. This 10756 parameter allows you to define how the columns from the transcripts table should be transformed 10757 or mapped 10758 :type column_formats: dict 10759 :param temporary_tables: The `temporary_tables` parameter in the 10760 `create_transcript_view_from_column_format` function is a list that stores the names of 10761 temporary views created during the process of creating a transcript view from a column format. 10762 These temporary views are used to manipulate and extract data before generating the final 10763 transcript view 10764 :type temporary_tables: list 10765 :param annotation_fields: The `annotation_fields` parameter in the 10766 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10767 that are extracted from the temporary views created during the process. 
These annotation fields 10768 are obtained by querying the temporary views and extracting the column names excluding specific 10769 columns like `#CH 10770 :type annotation_fields: list 10771 :param column_rename: The `column_rename` parameter in the 10772 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10773 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10774 column names to new column names in this dictionary, you can rename specific columns during the 10775 process 10776 :type column_rename: dict 10777 :param column_clean: The `column_clean` parameter in the 10778 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10779 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10780 will be cleaned during the creation of the transcript view based on the specified column format, 10781 defaults to False 10782 :type column_clean: bool (optional) 10783 :param column_case: The `column_case` parameter in the 10784 `create_transcript_view_from_column_format` function is used to specify the case transformation 10785 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10786 to convert the column names to uppercase or lowercase, respectively 10787 :type column_case: str 10788 :return: The `create_transcript_view_from_column_format` function returns two lists: 10789 `temporary_tables` and `annotation_fields`. 10790 """ 10791 10792 log.debug("Start transcrpts view creation from column format...") 10793 10794 # "from_column_format": [ 10795 # { 10796 # "transcripts_column": "ANN", 10797 # "transcripts_infos_column": "Feature_ID", 10798 # } 10799 # ], 10800 10801 # Init 10802 if temporary_tables is None: 10803 temporary_tables = [] 10804 if annotation_fields is None: 10805 annotation_fields = [] 10806 10807 for column_format in column_formats: 10808 10809 # annotation field and transcript annotation field 10810 annotation_field = column_format.get("transcripts_column", "ANN") 10811 transcript_annotation = column_format.get( 10812 "transcripts_infos_column", "Feature_ID" 10813 ) 10814 10815 # Transcripts infos columns rename 10816 column_rename = column_format.get("column_rename", column_rename) 10817 10818 # Transcripts infos columns clean 10819 column_clean = column_format.get("column_clean", column_clean) 10820 10821 # Transcripts infos columns case 10822 column_case = column_format.get("column_case", column_case) 10823 10824 # Temporary View name 10825 temporary_view_name = transcripts_table + "".join( 10826 random.choices(string.ascii_uppercase + string.digits, k=10) 10827 ) 10828 10829 # Create temporary view name 10830 temporary_view_name = self.annotation_format_to_table( 10831 uniquify=True, 10832 annotation_field=annotation_field, 10833 view_name=temporary_view_name, 10834 annotation_id=transcript_annotation, 10835 column_rename=column_rename, 10836 column_clean=column_clean, 10837 column_case=column_case, 10838 ) 10839 10840 # Annotation fields 10841 if temporary_view_name: 10842 query_annotation_fields = f""" 10843 SELECT * 10844 FROM ( 10845 DESCRIBE SELECT * 10846 FROM {temporary_view_name} 10847 ) 10848 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10849 """ 10850 df_annotation_fields = self.get_query_to_df( 10851 query=query_annotation_fields 10852 ) 10853 10854 # Add temporary view and annotation fields 10855 temporary_tables.append(temporary_view_name) 
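                # set() removes duplicate columns within this view; duplicates across
                # views are removed later in create_transcript_view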
                annotation_fields += list(set(df_annotation_fields["column_name"]))

        return temporary_tables, annotation_fields

    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data
        from the variants table based on the provided parameters and structural information.

        :param transcripts_table: The name of the table that will store the final transcript
        view data. If no name is provided, it is read from `param["transcripts"]["table"]`,
        defaults to transcripts
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: A boolean that determines whether to drop an existing
        transcripts table before creating a new one, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: A dictionary that describes how to create the transcript view, such as the
        structure of the transcripts ("struct", with columns mapping and column formats) and the
        transcript ID mapping options
        :type param: dict
        :return: The name of the transcripts table that was created or reused, or `None` if no
        structure is defined.
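
        A usage sketch, given a loaded `Variants` instance `variants` (assumes a
        snpEff-style 'ANN' INFO field is present; the parameter layout follows the
        commented examples in the helper methods above):

        >>> param = {
        ...     "transcripts": {
        ...         "table": "transcripts",
        ...         "struct": {
        ...             "from_column_format": [
        ...                 {
        ...                     "transcripts_column": "ANN",
        ...                     "transcripts_infos_column": "Feature_ID",
        ...                 }
        ...             ]
        ...         },
        ...     }
        ... }
        >>> variants.create_transcript_view(param=param)
        'transcripts'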
10887 """ 10888 10889 log.debug("Start transcripts view creation...") 10890 10891 # Default 10892 transcripts_table_default = "transcripts" 10893 10894 # Param 10895 if not param: 10896 param = self.get_param() 10897 10898 # Struct 10899 struct = param.get("transcripts", {}).get("struct", None) 10900 10901 # Transcript veresion 10902 transcript_id_remove_version = param.get("transcripts", {}).get( 10903 "transcript_id_remove_version", False 10904 ) 10905 10906 # Transcripts mapping 10907 transcript_id_mapping_file = param.get("transcripts", {}).get( 10908 "transcript_id_mapping_file", None 10909 ) 10910 10911 # Transcripts mapping 10912 transcript_id_mapping_force = param.get("transcripts", {}).get( 10913 "transcript_id_mapping_force", None 10914 ) 10915 10916 # Transcripts table 10917 if transcripts_table is None: 10918 transcripts_table = param.get("transcripts", {}).get( 10919 "table", transcripts_table_default 10920 ) 10921 10922 # Check transcripts table exists 10923 if transcripts_table: 10924 10925 # Query to check if transcripts table exists 10926 query_check_table = f""" 10927 SELECT * 10928 FROM information_schema.tables 10929 WHERE table_name = '{transcripts_table}' 10930 """ 10931 df_check_table = self.get_query_to_df(query=query_check_table) 10932 10933 # Check if transcripts table exists 10934 if len(df_check_table) > 0 and not transcripts_table_drop: 10935 log.debug(f"Table {transcripts_table} exists and not drop option") 10936 return transcripts_table 10937 10938 if struct: 10939 10940 # added_columns 10941 added_columns = [] 10942 10943 # Temporary tables 10944 temporary_tables = [] 10945 10946 # Annotation fields 10947 annotation_fields = [] 10948 10949 # from columns map 10950 columns_maps = struct.get("from_columns_map", []) 10951 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10952 self.create_transcript_view_from_columns_map( 10953 transcripts_table=transcripts_table, 10954 columns_maps=columns_maps, 10955 added_columns=added_columns, 10956 temporary_tables=temporary_tables, 10957 annotation_fields=annotation_fields, 10958 ) 10959 ) 10960 added_columns += added_columns_tmp 10961 temporary_tables += temporary_tables_tmp 10962 annotation_fields += annotation_fields_tmp 10963 10964 # from column format 10965 column_formats = struct.get("from_column_format", []) 10966 temporary_tables_tmp, annotation_fields_tmp = ( 10967 self.create_transcript_view_from_column_format( 10968 transcripts_table=transcripts_table, 10969 column_formats=column_formats, 10970 temporary_tables=temporary_tables, 10971 annotation_fields=annotation_fields, 10972 ) 10973 ) 10974 temporary_tables += temporary_tables_tmp 10975 annotation_fields += annotation_fields_tmp 10976 10977 # Remove some specific fields/column 10978 annotation_fields = list(set(annotation_fields)) 10979 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 10980 if field in annotation_fields: 10981 annotation_fields.remove(field) 10982 10983 # Merge temporary tables query 10984 query_merge = "" 10985 for temporary_table in list(set(temporary_tables)): 10986 10987 # First temporary table 10988 if not query_merge: 10989 query_merge = f""" 10990 SELECT * FROM {temporary_table} 10991 """ 10992 # other temporary table (using UNION) 10993 else: 10994 query_merge += f""" 10995 UNION BY NAME SELECT * FROM {temporary_table} 10996 """ 10997 10998 # transcript table tmp 10999 transcript_table_tmp = "transcripts_tmp" 11000 transcript_table_tmp2 = "transcripts_tmp2" 11001 transcript_table_tmp3 = 
"transcripts_tmp3" 11002 11003 # Merge on transcript 11004 query_merge_on_transcripts_annotation_fields = [] 11005 11006 # Add transcript list 11007 query_merge_on_transcripts_annotation_fields.append( 11008 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 11009 ) 11010 11011 # Aggregate all annotations fields 11012 for annotation_field in set(annotation_fields): 11013 query_merge_on_transcripts_annotation_fields.append( 11014 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 11015 ) 11016 11017 # Transcripts mapping 11018 if transcript_id_mapping_file: 11019 11020 # Transcript dataframe 11021 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 11022 transcript_id_mapping_dataframe = transcripts_file_to_df( 11023 transcript_id_mapping_file, column_names=["transcript", "alias"] 11024 ) 11025 11026 # Transcript version remove 11027 if transcript_id_remove_version: 11028 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 11029 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 11030 query_left_join = f""" 11031 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11032 """ 11033 else: 11034 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 11035 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 11036 query_left_join = f""" 11037 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11038 """ 11039 11040 # Transcript column for group by merge 11041 query_transcript_merge_group_by = """ 11042 CASE 11043 WHEN transcript_mapped NOT IN ('') 11044 THEN split_part(transcript_mapped, '.', 1) 11045 ELSE split_part(transcript_original, '.', 1) 11046 END 11047 """ 11048 11049 # Merge query 11050 transcripts_tmp2_query = f""" 11051 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 11052 FROM ({query_merge}) AS {transcript_table_tmp} 11053 {query_left_join} 11054 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 11055 """ 11056 11057 # Retrive columns after mege 11058 transcripts_tmp2_describe_query = f""" 11059 DESCRIBE {transcripts_tmp2_query} 11060 """ 11061 transcripts_tmp2_describe_list = list( 11062 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 11063 "column_name" 11064 ] 11065 ) 11066 11067 # Create list of columns for select clause 11068 transcripts_tmp2_describe_select_clause = [] 11069 for field in transcripts_tmp2_describe_list: 11070 if field not in [ 11071 "#CHROM", 11072 "POS", 11073 "REF", 11074 "ALT", 11075 "INFO", 11076 "transcript_mapped", 11077 ]: 11078 as_field = field 11079 if field in ["transcript_original"]: 11080 as_field = "transcripts_mapped" 11081 transcripts_tmp2_describe_select_clause.append( 11082 f""" 
list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11083 ) 11084 11085 # Merge with mapping 11086 query_merge_on_transcripts = f""" 11087 SELECT 11088 "#CHROM", POS, REF, ALT, INFO, 11089 CASE 11090 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11091 THEN ANY_VALUE(transcript_mapped) 11092 ELSE ANY_VALUE(transcript_original) 11093 END AS transcript, 11094 {", ".join(transcripts_tmp2_describe_select_clause)} 11095 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11096 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11097 {query_transcript_merge_group_by} 11098 """ 11099 11100 # Add transcript filter from mapping file 11101 if transcript_id_mapping_force: 11102 query_merge_on_transcripts = f""" 11103 SELECT * 11104 FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3} 11105 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11106 """ 11107 11108 # No transcript mapping 11109 else: 11110 11111 # Remove transcript version 11112 if transcript_id_remove_version: 11113 query_transcript_column = f""" 11114 split_part({transcript_table_tmp}.transcript, '.', 1) 11115 """ 11116 else: 11117 query_transcript_column = """ 11118 transcript 11119 """ 11120 11121 # Query sections 11122 query_transcript_column_select = ( 11123 f"{query_transcript_column} AS transcript" 11124 ) 11125 query_transcript_column_group_by = query_transcript_column 11126 11127 # Query for transcripts view 11128 query_merge_on_transcripts = f""" 11129 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11130 FROM ({query_merge}) AS {transcript_table_tmp} 11131 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11132 """ 11133 11134 # Drop transcript view is necessary 11135 if transcripts_table_drop: 11136 query_drop = f""" 11137 DROP TABLE IF EXISTS {transcripts_table}; 11138 """ 11139 self.execute_query(query=query_drop) 11140 11141 # # Merge and create transcript view 11142 # query_create_view = f""" 11143 # CREATE TABLE IF NOT EXISTS {transcripts_table} 11144 # AS {query_merge_on_transcripts} 11145 # """ 11146 # self.execute_query(query=query_create_view) 11147 11148 # Using #CHROM chunk 11149 ###### 11150 11151 # List of unique #CHROM 11152 query_unique_chrom = f""" 11153 SELECT DISTINCT "#CHROM" 11154 FROM variants AS subquery 11155 """ 11156 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11157 11158 # Create table with structure but without data, if not exists 11159 query_create_table = f""" 11160 CREATE TABLE IF NOT EXISTS {transcripts_table} AS 11161 SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0 11162 """ 11163 self.execute_query(query=query_create_table) 11164 11165 # Process by #CHROM 11166 for chrom in unique_chroms["#CHROM"]: 11167 11168 # Log 11169 log.debug(f"Processing #CHROM={chrom}") 11170 11171 # Select data by #CHROM 11172 query_chunk = f""" 11173 SELECT * 11174 FROM ({query_merge_on_transcripts}) 11175 WHERE "#CHROM" = '{chrom}' 11176 """ 11177 11178 # Insert data 11179 query_insert_chunk = f""" 11180 INSERT INTO {transcripts_table} 11181 {query_chunk} 11182 """ 11183 self.execute_query(query=query_insert_chunk) 11184 11185 # Remove temporary tables 11186 if temporary_tables: 11187 for temporary_table in list(set(temporary_tables)): 11188 query_drop_tmp_table = f""" 11189 DROP TABLE IF EXISTS 
{temporary_table} 11190 """ 11191 self.execute_query(query=query_drop_tmp_table) 11192 11193 # Remove added columns 11194 for added_column in added_columns: 11195 self.drop_column(column=added_column) 11196 11197 else: 11198 11199 transcripts_table = None 11200 11201 return transcripts_table 11202 11203 def annotation_format_to_table( 11204 self, 11205 uniquify: bool = True, 11206 annotation_field: str = "ANN", 11207 annotation_id: str = "Feature_ID", 11208 view_name: str = "transcripts", 11209 column_rename: dict = {}, 11210 column_clean: bool = False, 11211 column_case: str = None, 11212 ) -> str: 11213 """ 11214 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11215 structured table format, ensuring unique values and creating a temporary table for further 11216 processing or analysis. 11217 11218 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11219 unique values in the output or not. If set to `True`, the function will make sure that the 11220 output values are unique, defaults to True 11221 :type uniquify: bool (optional) 11222 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11223 that contains the annotation information for each variant. This field is used to extract the 11224 annotation details for further processing in the function. By default, it is set to "ANN", 11225 defaults to ANN 11226 :type annotation_field: str (optional) 11227 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11228 is used to specify the identifier for the annotation feature. This identifier will be used as a 11229 column name in the resulting table or view that is created based on the annotation data. It 11230 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11231 :type annotation_id: str (optional) 11232 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11233 to specify the name of the temporary table that will be created to store the transformed 11234 annotation data. This table will hold the extracted information from the annotation field in a 11235 structured format for further processing or analysis. By default,, defaults to transcripts 11236 :type view_name: str (optional) 11237 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11238 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11239 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11240 created based on the annotation data. This feature enables 11241 :type column_rename: dict 11242 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11243 a boolean flag that determines whether the annotation field should undergo a cleaning process. 11244 If set to `True`, the function will clean the annotation field before further processing. This 11245 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11246 to False 11247 :type column_clean: bool (optional) 11248 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11249 used to specify the case transformation to be applied to the column names extracted from the 11250 annotation data. 
It allows you to set the case of the column names to either lowercase or 11251 uppercase for consistency or other specific requirements during the conversion 11252 :type column_case: str 11253 :return: The function `annotation_format_to_table` is returning the name of the view created, 11254 which is stored in the variable `view_name`. 11255 """ 11256 11257 # Annotation field 11258 annotation_format = "annotation_explode" 11259 11260 # Transcript annotation 11261 if column_rename: 11262 annotation_id = column_rename.get(annotation_id, annotation_id) 11263 11264 if column_clean: 11265 annotation_id = clean_annotation_field(annotation_id) 11266 11267 # Prefix 11268 prefix = self.get_explode_infos_prefix() 11269 if prefix: 11270 prefix = "INFO/" 11271 11272 # Annotation fields 11273 annotation_infos = prefix + annotation_field 11274 annotation_format_infos = prefix + annotation_format 11275 11276 # Variants table 11277 table_variants = self.get_table_variants() 11278 11279 # Header 11280 vcf_reader = self.get_header() 11281 11282 # Add columns 11283 added_columns = [] 11284 11285 # Explode HGVS field in column 11286 added_columns += self.explode_infos(fields=[annotation_field]) 11287 11288 if annotation_field in vcf_reader.infos: 11289 11290 # Extract ANN header 11291 ann_description = vcf_reader.infos[annotation_field].desc 11292 pattern = r"'(.+?)'" 11293 match = re.search(pattern, ann_description) 11294 if match: 11295 ann_header_match = match.group(1).split(" | ") 11296 ann_header = [] 11297 ann_header_desc = {} 11298 for i in range(len(ann_header_match)): 11299 ann_header_info = "".join( 11300 char for char in ann_header_match[i] if char.isalnum() 11301 ) 11302 ann_header.append(ann_header_info) 11303 ann_header_desc[ann_header_info] = ann_header_match[i] 11304 if not ann_header_desc: 11305 raise ValueError("Invalid header description format") 11306 else: 11307 raise ValueError("Invalid header description format") 11308 11309 # Create variant id 11310 variant_id_column = self.get_variant_id_column() 11311 added_columns += [variant_id_column] 11312 11313 # Get list of #CHROM 11314 query_unique_chrom = f""" 11315 SELECT DISTINCT "#CHROM" 11316 FROM variants AS subquery 11317 """ 11318 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11319 11320 # Base for database anontation format 11321 dataframe_annotation_format_base = f""" 11322 SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" 11323 FROM {table_variants} 11324 """ 11325 11326 # Create dataframe for keys column type 11327 dataframe_annotation_format = self.get_query_to_df( 11328 f""" {dataframe_annotation_format_base} LIMIT 1000 """ 11329 ) 11330 11331 # Define a vectorized function to apply explode_annotation_format 11332 vectorized_explode_annotation_format = np.vectorize( 11333 lambda x: explode_annotation_format( 11334 annotation=str(x), 11335 uniquify=uniquify, 11336 output_format="JSON", 11337 prefix="", 11338 header=list(ann_header_desc.values()), 11339 ) 11340 ) 11341 11342 # Assign the exploded annotations back to the dataframe 11343 dataframe_annotation_format[annotation_format_infos] = ( 11344 vectorized_explode_annotation_format( 11345 dataframe_annotation_format[annotation_infos].to_numpy() 11346 ) 11347 ) 11348 11349 # Find keys 11350 query_json = f""" 11351 SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' 11352 FROM dataframe_annotation_format; 11353 """ 11354 df_keys = self.get_query_to_df(query=query_json) 11355 11356 # Check keys 11357 query_json_key 
= [] 11358 for _, row in df_keys.iterrows(): 11359 11360 # Key 11361 key = row.iloc[0] 11362 key_clean = key 11363 11364 # key rename 11365 if column_rename: 11366 key_clean = column_rename.get(key_clean, key_clean) 11367 11368 # key clean 11369 if column_clean: 11370 key_clean = clean_annotation_field(key_clean) 11371 11372 # Key case 11373 if column_case: 11374 if column_case.lower() in ["lower"]: 11375 key_clean = key_clean.lower() 11376 elif column_case.lower() in ["upper"]: 11377 key_clean = key_clean.upper() 11378 11379 # Type 11380 query_json_type = f""" 11381 SELECT * 11382 FROM ( 11383 SELECT 11384 NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '') AS '{key_clean}' 11385 FROM 11386 dataframe_annotation_format 11387 ) 11388 WHERE "{key_clean}" NOT NULL AND "{key_clean}" NOT IN ('') 11389 """ 11390 11391 # Get DataFrame from query 11392 df_json_type = self.get_query_to_df(query=query_json_type) 11393 11394 # Detect column type 11395 column_type = detect_column_type(df_json_type[key_clean]) 11396 11397 # Free up memory 11398 del df_json_type 11399 11400 # Append 11401 query_json_key.append( 11402 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11403 ) 11404 11405 # Create table with structure but without data, if not exists 11406 query_create_table = f""" 11407 CREATE TABLE IF NOT EXISTS {view_name} 11408 AS ( 11409 SELECT *, {annotation_id} AS 'transcript' 11410 FROM ( 11411 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11412 FROM dataframe_annotation_format 11413 ) 11414 LIMIT 0 11415 ); 11416 """ 11417 self.execute_query(query=query_create_table) 11418 11419 # Free up memory 11420 del dataframe_annotation_format 11421 11422 # Insert data by chromosome 11423 for chrom in unique_chroms["#CHROM"]: 11424 11425 # Log 11426 log.debug(f"Processing #CHROM={chrom}") 11427 11428 # Create dataframe 11429 dataframe_annotation_format = self.get_query_to_df( 11430 f""" {dataframe_annotation_format_base} WHERE "#CHROM" = '{chrom}' """ 11431 ) 11432 11433 # Define a vectorized function to apply explode_annotation_format 11434 vectorized_explode_annotation_format = np.vectorize( 11435 lambda x: explode_annotation_format( 11436 annotation=str(x), 11437 uniquify=uniquify, 11438 output_format="JSON", 11439 prefix="", 11440 header=list(ann_header_desc.values()), 11441 ) 11442 ) 11443 11444 # Assign the exploded annotations back to the dataframe 11445 dataframe_annotation_format[annotation_format_infos] = ( 11446 vectorized_explode_annotation_format( 11447 dataframe_annotation_format[annotation_infos].to_numpy() 11448 ) 11449 ) 11450 11451 # Insert data into tmp table 11452 query_insert_chunk = f""" 11453 INSERT INTO {view_name} 11454 SELECT *, {annotation_id} AS 'transcript' 11455 FROM ( 11456 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11457 FROM dataframe_annotation_format 11458 ) 11459 """ 11460 self.execute_query(query=query_insert_chunk) 11461 11462 # Free up memory 11463 del dataframe_annotation_format 11464 11465 else: 11466 11467 # Return None 11468 view_name = None 11469 11470 # Remove added columns 11471 for added_column in added_columns: 11472 self.drop_column(column=added_column) 11473 11474 return view_name 11475 11476 def transcript_view_to_variants( 11477 self, 11478 transcripts_table: str = None, 11479 transcripts_column_id: str = None, 11480 transcripts_info_json: str = None, 11481 transcripts_info_field_json: str = None, 11482 
transcripts_info_format: str = None, 11483 transcripts_info_field_format: str = None, 11484 param: dict = {}, 11485 ) -> bool: 11486 """ 11487 The `transcript_view_to_variants` function updates a variants table with information from 11488 transcripts in JSON format. 11489 11490 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11491 table containing the transcripts data. If this parameter is not provided, the function will 11492 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11493 :type transcripts_table: str 11494 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11495 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11496 identifier is used to match transcripts with variants in the database 11497 :type transcripts_column_id: str 11498 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11499 of the column in the variants table where the transcripts information will be stored in JSON 11500 format. This parameter allows you to define the column in the variants table that will hold the 11501 JSON-formatted information about transcripts 11502 :type transcripts_info_json: str 11503 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11504 specify the field in the VCF header that will contain information about transcripts in JSON 11505 format. This field will be added to the VCF header as an INFO field with the specified name 11506 :type transcripts_info_field_json: str 11507 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11508 format of the information about transcripts that will be stored in the variants table. This 11509 format can be used to define how the transcript information will be structured or displayed 11510 within the variants table 11511 :type transcripts_info_format: str 11512 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11513 specify the field in the VCF header that will contain information about transcripts in a 11514 specific format. This field will be added to the VCF header as an INFO field with the specified 11515 name 11516 :type transcripts_info_field_format: str 11517 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11518 that contains various configuration settings related to transcripts. It is used to provide 11519 default values for certain parameters if they are not explicitly provided when calling the 11520 method. The `param` dictionary can be passed as an argument 11521 :type param: dict 11522 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11523 if the operation is successful and `False` if certain conditions are not met. 
11524 """ 11525 11526 msg_info_prefix = "Start transcripts view to variants annotations" 11527 11528 log.debug(f"{msg_info_prefix}...") 11529 11530 # Default 11531 transcripts_table_default = "transcripts" 11532 transcripts_column_id_default = "transcript" 11533 transcripts_info_json_default = None 11534 transcripts_info_format_default = None 11535 transcripts_info_field_json_default = None 11536 transcripts_info_field_format_default = None 11537 11538 # Param 11539 if not param: 11540 param = self.get_param() 11541 11542 # Transcripts table 11543 if transcripts_table is None: 11544 transcripts_table = param.get("transcripts", {}).get( 11545 "table", transcripts_table_default 11546 ) 11547 11548 # Transcripts column ID 11549 if transcripts_column_id is None: 11550 transcripts_column_id = param.get("transcripts", {}).get( 11551 "column_id", transcripts_column_id_default 11552 ) 11553 11554 # Transcripts info json 11555 if transcripts_info_json is None: 11556 transcripts_info_json = param.get("transcripts", {}).get( 11557 "transcripts_info_json", transcripts_info_json_default 11558 ) 11559 11560 # Transcripts info field JSON 11561 if transcripts_info_field_json is None: 11562 transcripts_info_field_json = param.get("transcripts", {}).get( 11563 "transcripts_info_field_json", transcripts_info_field_json_default 11564 ) 11565 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11566 # transcripts_info_json = transcripts_info_field_json 11567 11568 # Transcripts info format 11569 if transcripts_info_format is None: 11570 transcripts_info_format = param.get("transcripts", {}).get( 11571 "transcripts_info_format", transcripts_info_format_default 11572 ) 11573 11574 # Transcripts info field FORMAT 11575 if transcripts_info_field_format is None: 11576 transcripts_info_field_format = param.get("transcripts", {}).get( 11577 "transcripts_info_field_format", transcripts_info_field_format_default 11578 ) 11579 # if ( 11580 # transcripts_info_field_format is not None 11581 # and transcripts_info_format is None 11582 # ): 11583 # transcripts_info_format = transcripts_info_field_format 11584 11585 # Variants table 11586 table_variants = self.get_table_variants() 11587 11588 # Check info columns param 11589 if ( 11590 transcripts_info_json is None 11591 and transcripts_info_field_json is None 11592 and transcripts_info_format is None 11593 and transcripts_info_field_format is None 11594 ): 11595 return False 11596 11597 # Transcripts infos columns 11598 query_transcripts_infos_columns = f""" 11599 SELECT * 11600 FROM ( 11601 DESCRIBE SELECT * FROM {transcripts_table} 11602 ) 11603 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11604 """ 11605 transcripts_infos_columns = list( 11606 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11607 ) 11608 11609 # View results 11610 clause_select = [] 11611 clause_to_json = [] 11612 clause_to_format = [] 11613 for field in transcripts_infos_columns: 11614 # Do not consider INFO field for export into fields 11615 if field not in ["INFO"]: 11616 clause_select.append( 11617 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11618 ) 11619 clause_to_json.append(f""" '{field}': "{field}" """) 11620 clause_to_format.append(f""" "{field}" """) 11621 11622 # Update 11623 update_set_json = [] 11624 update_set_format = [] 11625 11626 # VCF header 11627 vcf_reader = self.get_header() 11628 11629 # Transcripts to info column in JSON 11630 if transcripts_info_json: 
11631 11632 # Create column on variants table 11633 self.add_column( 11634 table_name=table_variants, 11635 column_name=transcripts_info_json, 11636 column_type="JSON", 11637 default_value=None, 11638 drop=False, 11639 ) 11640 11641 # Add header 11642 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11643 transcripts_info_json, 11644 ".", 11645 "String", 11646 "Transcripts in JSON format", 11647 "unknwon", 11648 "unknwon", 11649 self.code_type_map["String"], 11650 ) 11651 11652 # Add to update 11653 update_set_json.append( 11654 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11655 ) 11656 11657 # Transcripts to info field in JSON 11658 if transcripts_info_field_json: 11659 11660 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11661 11662 # Add to update 11663 update_set_json.append( 11664 f""" 11665 INFO = concat( 11666 CASE 11667 WHEN INFO NOT IN ('', '.') 11668 THEN INFO 11669 ELSE '' 11670 END, 11671 CASE 11672 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11673 THEN concat( 11674 ';{transcripts_info_field_json}=', 11675 t.{transcripts_info_json} 11676 ) 11677 ELSE '' 11678 END 11679 ) 11680 """ 11681 ) 11682 11683 # Add header 11684 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11685 transcripts_info_field_json, 11686 ".", 11687 "String", 11688 "Transcripts in JSON format", 11689 "unknwon", 11690 "unknwon", 11691 self.code_type_map["String"], 11692 ) 11693 11694 if update_set_json: 11695 11696 # Update query 11697 query_update = f""" 11698 UPDATE {table_variants} 11699 SET {", ".join(update_set_json)} 11700 FROM 11701 ( 11702 SELECT 11703 "#CHROM", POS, REF, ALT, 11704 concat( 11705 '{{', 11706 string_agg( 11707 '"' || "{transcripts_column_id}" || '":' || 11708 to_json(json_output) 11709 ), 11710 '}}' 11711 )::JSON AS {transcripts_info_json} 11712 FROM 11713 ( 11714 SELECT 11715 "#CHROM", POS, REF, ALT, 11716 "{transcripts_column_id}", 11717 to_json( 11718 {{{",".join(clause_to_json)}}} 11719 )::JSON AS json_output 11720 FROM 11721 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11722 WHERE "{transcripts_column_id}" IS NOT NULL 11723 ) 11724 GROUP BY "#CHROM", POS, REF, ALT 11725 ) AS t 11726 WHERE {table_variants}."#CHROM" = t."#CHROM" 11727 AND {table_variants}."POS" = t."POS" 11728 AND {table_variants}."REF" = t."REF" 11729 AND {table_variants}."ALT" = t."ALT" 11730 """ 11731 11732 self.execute_query(query=query_update) 11733 11734 # Transcripts to info column in FORMAT 11735 if transcripts_info_format: 11736 11737 # Create column on variants table 11738 self.add_column( 11739 table_name=table_variants, 11740 column_name=transcripts_info_format, 11741 column_type="VARCHAR", 11742 default_value=None, 11743 drop=False, 11744 ) 11745 11746 # Add header 11747 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11748 transcripts_info_format, 11749 ".", 11750 "String", 11751 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11752 "unknwon", 11753 "unknwon", 11754 self.code_type_map["String"], 11755 ) 11756 11757 # Add to update 11758 update_set_format.append( 11759 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11760 ) 11761 11762 else: 11763 11764 # Set variable for internal queries 11765 transcripts_info_format = "transcripts_info_format" 11766 11767 # Transcripts to info field in JSON 11768 if transcripts_info_field_format: 11769 11770 log.debug(f"{msg_info_prefix} - Annotation in 
structured format...") 11771 11772 # Add to update 11773 update_set_format.append( 11774 f""" 11775 INFO = concat( 11776 CASE 11777 WHEN INFO NOT IN ('', '.') 11778 THEN INFO 11779 ELSE '' 11780 END, 11781 CASE 11782 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11783 THEN concat( 11784 ';{transcripts_info_field_format}=', 11785 t.{transcripts_info_format} 11786 ) 11787 ELSE '' 11788 END 11789 ) 11790 """ 11791 ) 11792 11793 # Add header 11794 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11795 transcripts_info_field_format, 11796 ".", 11797 "String", 11798 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11799 "unknwon", 11800 "unknwon", 11801 self.code_type_map["String"], 11802 ) 11803 11804 if update_set_format: 11805 11806 # Update query 11807 query_update = f""" 11808 UPDATE {table_variants} 11809 SET {", ".join(update_set_format)} 11810 FROM 11811 ( 11812 SELECT 11813 "#CHROM", POS, REF, ALT, 11814 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11815 FROM 11816 ( 11817 SELECT 11818 "#CHROM", POS, REF, ALT, 11819 "{transcripts_column_id}", 11820 concat( 11821 "{transcripts_column_id}", 11822 '|', 11823 {", '|', ".join(clause_to_format)} 11824 ) AS {transcripts_info_format} 11825 FROM 11826 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11827 ) 11828 GROUP BY "#CHROM", POS, REF, ALT 11829 ) AS t 11830 WHERE {table_variants}."#CHROM" = t."#CHROM" 11831 AND {table_variants}."POS" = t."POS" 11832 AND {table_variants}."REF" = t."REF" 11833 AND {table_variants}."ALT" = t."ALT" 11834 """ 11835 11836 self.execute_query(query=query_update) 11837 11838 return True 11839 11840 def rename_info_fields( 11841 self, fields_to_rename: dict = None, table: str = None 11842 ) -> dict: 11843 """ 11844 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11845 corresponding INFO fields in the variants table. 11846 11847 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11848 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11849 represent the original field names that need to be renamed, and the corresponding values 11850 represent the new names to which the fields should be 11851 :type fields_to_rename: dict 11852 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11853 the table in which the variants data is stored. This table contains information about genetic 11854 variants, and the function updates the corresponding INFO fields in this table when renaming 11855 specified fields in the VCF file header 11856 :type table: str 11857 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11858 the original field names as keys and their corresponding new names (or None if the field was 11859 removed) as values after renaming or removing specified fields in a VCF file header and updating 11860 corresponding INFO fields in the variants table. 
11861 """ 11862 11863 # Init 11864 fields_renamed = {} 11865 config = self.get_config() 11866 access = config.get("access") 11867 11868 if table is None: 11869 table = self.get_table_variants() 11870 11871 # regexp replace fonction 11872 regex_replace_dict = {} 11873 regex_replace_nb = 0 11874 regex_replace_partition = 125 11875 regex_replace = "concat(INFO, ';')" # Add ';' to reduce regexp comlexity 11876 11877 if fields_to_rename is not None and access not in ["RO"]: 11878 11879 log.info("Rename or remove fields...") 11880 11881 # Header 11882 header = self.get_header() 11883 11884 for field_to_rename, field_renamed in fields_to_rename.items(): 11885 11886 if field_to_rename in header.infos: 11887 11888 # Rename header 11889 if field_renamed is not None: 11890 header.infos[field_renamed] = vcf.parser._Info( 11891 field_renamed, 11892 header.infos[field_to_rename].num, 11893 header.infos[field_to_rename].type, 11894 header.infos[field_to_rename].desc, 11895 header.infos[field_to_rename].source, 11896 header.infos[field_to_rename].version, 11897 header.infos[field_to_rename].type_code, 11898 ) 11899 del header.infos[field_to_rename] 11900 11901 # Rename INFO patterns 11902 field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;" 11903 if field_renamed is not None: 11904 field_renamed_pattern = rf"\1{field_renamed}\3;" 11905 else: 11906 field_renamed_pattern = r"\1" 11907 11908 # regexp replace 11909 regex_replace_nb += 1 11910 regex_replace_key = math.floor( 11911 regex_replace_nb / regex_replace_partition 11912 ) 11913 if (regex_replace_nb % regex_replace_partition) == 0: 11914 regex_replace = "concat(INFO, ';')" 11915 regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')" 11916 regex_replace_dict[regex_replace_key] = regex_replace 11917 11918 # Return 11919 fields_renamed[field_to_rename] = field_renamed 11920 11921 # Log 11922 if field_renamed is not None: 11923 log.info( 11924 f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'" 11925 ) 11926 else: 11927 log.info( 11928 f"Rename or remove fields - field '{field_to_rename}' removed" 11929 ) 11930 11931 else: 11932 11933 log.warning( 11934 f"Rename or remove fields - field '{field_to_rename}' not in header" 11935 ) 11936 11937 # Rename INFO 11938 for regex_replace_key, regex_replace in regex_replace_dict.items(): 11939 log.info( 11940 f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..." 11941 ) 11942 query = f""" 11943 UPDATE {table} 11944 SET 11945 INFO = regexp_replace({regex_replace}, ';$', '') 11946 """ 11947 log.debug(f"query={query}") 11948 self.execute_query(query=query) 11949 11950 return fields_renamed 11951 11952 def calculation_rename_info_fields( 11953 self, 11954 fields_to_rename: dict = None, 11955 table: str = None, 11956 operation_name: str = "RENAME_INFO_FIELDS", 11957 ) -> None: 11958 """ 11959 The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates 11960 fields to rename and table if provided, and then calls another function to rename the fields. 11961 11962 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11963 renamed in a table. 
Each key-value pair in the dictionary represents the original field name as 11964 the key and the new field name as the value 11965 :type fields_to_rename: dict 11966 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11967 specify the name of the table for which the fields are to be renamed. It is a string type 11968 parameter 11969 :type table: str 11970 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11971 method is a string that specifies the name of the operation being performed. In this context, it 11972 is used as a default value for the operation name if not explicitly provided when calling the 11973 function, defaults to RENAME_INFO_FIELDS 11974 :type operation_name: str (optional) 11975 """ 11976 11977 # Param 11978 param = self.get_param() 11979 11980 # Get param fields to rename 11981 param_fields_to_rename = ( 11982 param.get("calculation", {}) 11983 .get("calculations", {}) 11984 .get(operation_name, {}) 11985 .get("fields_to_rename", None) 11986 ) 11987 11988 # Get param table 11989 param_table = ( 11990 param.get("calculation", {}) 11991 .get("calculations", {}) 11992 .get(operation_name, {}) 11993 .get("table", None) 11994 ) 11995 11996 # Init fields_to_rename 11997 if fields_to_rename is None: 11998 fields_to_rename = param_fields_to_rename 11999 12000 # Init table 12001 if table is None: 12002 table = param_table 12003 12004 renamed_fields = self.rename_info_fields( 12005 fields_to_rename=fields_to_rename, table=table 12006 ) 12007 12008 log.debug(f"renamed_fields:{renamed_fields}")
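As a hedged usage sketch of `rename_info_fields` (field names hypothetical; a `None` value removes the field instead of renaming it), assuming a loaded `Variants` object named `variants`:

renamed = variants.rename_info_fields(
    fields_to_rename={"ANN": "SNPEFF_ANN", "OLD_FIELD": None}  # hypothetical field names
)
# -> {"ANN": "SNPEFF_ANN", "OLD_FIELD": None}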
def __init__(self, conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False) -> None
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
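As a minimal usage sketch (the file paths and the `threads` config entry are hypothetical), a `Variants` object can be created and its data loaded in one call:

from howard.objects.variants import Variants

variants = Variants(
    input="sample.vcf.gz",    # hypothetical input file
    output="sample.parquet",  # hypothetical output file
    config={"threads": 4},    # configuration dictionary
    param={},                 # parameters dictionary
    load=True,                # load variants into the database at construction time
)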
def set_samples(self, samples: list = None) -> list
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: a list of samples to set as the `samples` attribute of the class. If no samples are provided, the method tries to get the samples from the class's parameters using the `get_param` method
Returns
The `samples` list is being returned.
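A hedged sketch of the fallback behaviour (sample names hypothetical):

variants = Variants(param={"samples": {"list": ["sample1", "sample2"]}})
variants.set_samples()                     # falls back to param: ["sample1", "sample2"]
variants.set_samples(samples=["sample3"])  # an explicit list takes precedence: ["sample3"]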
def get_samples(self) -> list
This function returns a list of samples.
Returns
The `get_samples` method is returning the `samples` attribute of the object.
def get_samples_check(self) -> bool
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it returns `True`.
def set_input(self, input: str = None) -> None
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: the input file, given as a path or as a file-like object with a `name` attribute; its name and extension are used to set the input name, extension, and format attributes
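For illustration (hypothetical file name), the derived attributes look like this:

variants = Variants()
variants.set_input("sample.vcf")
# variants.input_name      -> "sample"
# variants.input_extension -> ".vcf"
# variants.input_format    -> "vcf"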
def set_config(self, config: dict) -> None
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: a dictionary object that contains configuration settings for the class; it is set as the configuration object for the class
def set_param(self, param: dict) -> None
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: a dictionary object that is set as the `param` attribute of the class instance
def init_variables(self) -> None
This function initializes the variables that will be used in the rest of the class
def get_indexing(self) -> bool
It returns the value of the key "indexing" in the param dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
def get_connexion_config(self) -> dict
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the database connection, including threads, memory limit, temporary directory and access mode.
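A hedged sketch of the mapping from config keys to connection settings (values hypothetical; `threads` and `memory` are assumed to be read from the config by `get_threads` and `get_memory`):

variants = Variants(
    config={"threads": 8, "memory": "8G", "tmp": "/tmp/howard", "access": "RO"}
)
connexion_config = variants.get_connexion_config()
# Possible result for an on-disk database:
# {"threads": 8, "memory_limit": "8G", "temp_directory": "/tmp/howard", "access_mode": "READ_ONLY"}
# (for an in-memory database the access mode is forced to READ_WRITE)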
def get_duckdb_settings(self) -> dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
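For illustration (the setting name is a standard DuckDB option, used here as an assumption), the settings can be given inline as a JSON string or as a path to a JSON/YAML file:

# Inline JSON string
variants = Variants(config={"duckdb_settings": '{"max_expression_depth": 10000}'})
variants.get_duckdb_settings()  # -> {"max_expression_depth": 10000}

# Or a path to a settings file (hypothetical path)
variants = Variants(config={"duckdb_settings": "/path/to/duckdb_settings.yaml"})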
def set_connexion_db(self) -> str
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
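A hedged summary of the resolution order (paths hypothetical):

Variants(input="variants.duckdb")                        # connexion_db == "variants.duckdb"
Variants(config={"connexion_type": "memory"})            # connexion_db == ":memory:"
Variants(config={"connexion_type": "tmpfile"})           # connexion_db == "<tmp_dir>/tmp.db"
Variants(config={"connexion_type": "/path/to/base.db"})  # connexion_db == "/path/to/base.db"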
def set_connexion(self, conn) -> None
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: the connection to the database. If a connection is not provided, a new connection is created according to the connexion format (e.g. duckdb or sqlite), by default to an in-memory database
def set_output(self, output: str = None) -> None
The `set_output` function sets the output file, extracting the output name, extension, and format.
Parameters
- output: the name of the output file, given as a path or as a file-like object with a `name` attribute. If no output is provided, the output is set to `None`
def set_header(self) -> None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
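As a hedged sketch of the resolution order (paths hypothetical): an explicit `header_file` in the config wins, then the header found inside a VCF/HDR input, then a sidecar `<input>.hdr` file, and a default VCF header is used as a last resort:

variants = Variants(input="sample.tsv", config={"header_file": "sample.hdr"})
header_list = variants.get_header(type="list")  # header lines as strings
header_vcf = variants.get_header(type="vcf")    # header as a VCF reader object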
def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: a string containing the SQL query to execute; it is used to fetch data from the database and convert it into a pandas DataFrame
- limit: the maximum number of rows to fetch into the resulting DataFrame. If no limit is specified, the full query result is returned
Returns
A pandas DataFrame is being returned by the `get_query_to_df` function.
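A small usage sketch against the default `variants` table, assuming a loaded `Variants` object named `variants`:

df = variants.get_query_to_df(
    query='SELECT "#CHROM", count(*) AS count FROM variants GROUP BY "#CHROM"',
    limit=10,  # fetch at most the first 10 rows of the result
)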
def get_overview(self) -> None
The function logs the input, output, config, param, sample list and a dataframe preview of the current object
def get_stats(self) -> dict
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object, organized into sections such as "Infos", "Variants", "Samples", "Header" and "Quality".
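A hedged sketch of the sections available in the returned dictionary (actual values depend on the loaded file):

stats = variants.get_stats()
stats["Infos"]["Number of variants"]  # total variant count
stats["Variants"]["Counts"]           # SNV / MNV / InDel / Total counts
stats["Samples"]                      # per-sample genotype counts
stats["Header"]                       # INFO and FORMAT field descriptions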
def stats_to_file(self, file: str = None) -> str
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: a string that represents the file path where the JSON data will be written
Returns
the name of the file that was written to.
def print_stats(self, output_file: str = None, json_file: str = None) -> None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: a string that specifies the path of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, the stats are saved in a file named "stats.md" within a temporary directory
- json_file: a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a file named "stats.json" within a temporary directory is used
Returns
The function `print_stats` does not return any value. It has a return type annotation of `None`.
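A usage sketch combining both methods (paths hypothetical):

variants.stats_to_file(file="stats.json")                             # write stats as JSON
variants.print_stats(output_file="stats.md", json_file="stats.json")  # render them as Markdown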
def get_input(self) -> str
It returns the value of the input variable.
Returns
The input is being returned.
def get_input_format(self, input_file: str = None) -> str
This function returns the format of the input file, either from the provided input file or from the object's input attribute.
Parameters
- input_file: a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to the object's input
Returns
The format of the input variable is being returned.
def get_input_compressed(self, input_file: str = None) -> str
The function `get_input_compressed` returns the compression status of the input file.
Parameters
- input_file: a string that represents the file path of the input file. If no `input_file` is provided, the method falls back to `self.get_input()`
Returns
The function `get_input_compressed` returns whether the input file is compressed.
def get_output(self) -> str
It returns the output file.
Returns
The `output` attribute is being returned.
def get_output_format(self, output_file: str = None) -> str
The function `get_output_format` returns the format of the output file, either the provided one or the object's output.
Parameters
- output_file: a string that represents the file path of the output file. If no `output_file` is provided, it defaults to the output obtained from the `get_output` method of the class instance
Returns
The format of the output file is being returned.
def get_config(self) -> dict
It returns the config
Returns
The config variable is being returned.
def get_param(self) -> dict
It returns the param
Returns
The param variable is being returned.
def get_connexion_db(self) -> str
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
def get_prefix(self) -> str
It returns the prefix of the object.
Returns
The prefix is being returned.
def get_table_variants(self, clause: str = "select") -> str
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause in which the table will be used, either "select" or "from"; defaults to "select" (optional)
Returns
The table_variants attribute of the object.
    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on configuration
        parameters or a default path.
        :return: The temporary directory path based on the configuration, parameters, and a
            default value of "/tmp".
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )
    def get_connexion_type(self) -> str:
        """
        It returns the connexion type from the config, defaulting to "memory".

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")
    def get_connexion(self):
        """
        It returns the connection object

        :return: The connection object.
        """
        return self.conn
    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: The connection is being closed.
        """
        return self.conn.close()
    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file, either as a `vcf.Reader` object or as
        a list of strings

        :param type: the type of header you want to get, either "vcf" or "list", defaults to
            vcf (optional)
        :return: The header of the VCF file.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required
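    # Usage sketch (illustrative): the "vcf" header is a `vcf.Reader` object whose
    # `infos` attribute maps INFO field names to their definitions; the "list"
    # header is the raw header lines. Assumes a loaded object named `variants`.
    #
    #   header = variants.get_header(type="vcf")
    #   "DP" in header.infos                  # is the DP field declared?
    #   variants.get_header(type="list")[0]   # e.g. '##fileformat=VCFv4.2'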
    def get_header_infos_list(self) -> list:
        """
        This function retrieves a list of information fields from the header.
        :return: A list of information fields from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list
    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the
        #CHROM line.

        :param file: An optional path to a VCF header file. If provided, the function reads the
            header from the specified file and returns the length of the header list minus 1
            (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0
    def get_header_columns(self) -> str:
        """
        This function returns the columns line (#CHROM line) of the VCF header

        :return: The last line of the header, or "" if there is no header.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""
    def get_header_columns_as_list(self) -> list:
        """
        This function returns the columns line (#CHROM line) of the VCF header as a list

        :return: The header columns as a list, or [] if there is no header.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []
    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as a comma-separated list of quoted SQL
        column names

        :return: The header columns as a SQL string.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)
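    # Usage sketch (illustrative values) of the three column helpers on a typical
    # single-sample VCF header:
    #
    #   variants.get_header_columns()
    #   # -> '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsampleA'
    #   variants.get_header_columns_as_list()
    #   # -> ['#CHROM', 'POS', ..., 'sampleA']
    #   variants.get_header_columns_as_sql()
    #   # -> '"#CHROM","POS",...,"sampleA"' (quoted, ready to embed in a SELECT)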
    def get_header_sample_list(
        self, check: bool = False, samples: list = None, samples_force: bool = False
    ) -> list:
        """
        The function `get_header_sample_list` returns a list of samples from a VCF header, with
        optional checking and filtering based on input parameters.

        :param check: If `check` is set to `True`, the function will verify that each sample in
            the list is properly defined as a genotype column, defaults to False
        :type check: bool (optional)
        :param samples: A list of sample names used to select a subset of samples from the
            header. Samples not found in the header are skipped with a warning
        :type samples: list
        :param samples_force: If `samples_force` is set to `True`, the function will return the
            sample list without checking whether the samples are genotype columns, defaults to
            False
        :type samples_force: bool (optional)
        :return: A list of samples based on the input parameters and conditions specified in
            the function.
        """

        # Init
        samples_list = []

        if samples is None:
            samples_list = self.header_vcf.samples
        else:
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        if check:
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list
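    # Usage sketch (sample names hypothetical):
    #
    #   variants.get_header_sample_list()                     # all samples from the header
    #   variants.get_header_sample_list(samples=["sampleA"])  # subset, warns on unknown names
    #   variants.get_header_sample_list(check=True)           # keep genotype columns only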
    def is_genotype_column(self, column: str = None) -> bool:
        """
        This function checks if a given column is a genotype column in a database.

        :param column: The name of the column to check. If provided, the check is delegated to
            the `is_genotype_column` method of the `Database` class
        :type column: str
        :return: `True` if the column is a genotype column, `False` otherwise (including when
            `column` is None).
        """

        if column is not None:
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False
    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False if the key
        doesn't exist

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)
    def get_connexion_format(self) -> str:
        """
        It returns the connexion format of the object.
        :return: The connexion_format is being returned.
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the
        specified database format.

        :param file: The file to load into the table (a path or a file object)
        :param columns: A string containing the names of the table columns where the data will
            be inserted, separated by commas, e.g. '"id", "name"'
        :type columns: str
        :param header_len: The number of header lines to skip at the beginning of the file
            before reading the actual data, defaults to 0
        :type header_len: int (optional)
        :param sep: The separator character used in the file being read, defaults to a tab
            character
        :type sep: str (optional)
        :param chunksize: The number of rows to read at a time when processing the file in
            chunks, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB can query the local pandas DataFrame "chunk" directly
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
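    # Usage sketch (paths and columns hypothetical): load the body of a
    # tab-separated file into an existing "variants" table with matching columns.
    #
    #   variants.insert_file_to_table(
    #       "variants_body.tsv",
    #       columns='"#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"',
    #       header_len=0,
    #       sep="\t",
    #   )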
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to
        drop the table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be
            loaded into the table
        :type input_file: str
        :param drop_variants_table: A boolean flag that determines whether the variants table
            should be dropped before loading the data, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The number of rows to be sampled from the input file; if falsy, -1
            is used instead, defaults to 20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatible with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except Exception:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples (copy to avoid mutating the main structure)
            structure_complete = structure.copy()
            if self.get_header_sample_list():
                structure_complete["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the length of each file chunk to load
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
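    # End-to-end sketch (file names hypothetical): constructing with load=True
    # calls load_data() implicitly, but it can also be called explicitly, e.g. to
    # reload another input into a fresh table.
    #
    #   variants = Variants(input="sample.vcf.gz", output="sample.parquet", load=True)
    #   variants.load_data(input_file="other.vcf", drop_variants_table=True)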
    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos" parameter,
        defaulting to False if it is not set.
        :return: The value of the "explode_infos" parameter, as a boolean. If the parameter is
            not present, it will return False.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of INFO fields to explode, based
        on the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The fields to be exploded, either as a comma-separated
            string or as a list. Each field may be a regex pattern, and the keyword "*" matches
            all fields in the header. If not provided, the value is taken from the parameters,
            and defaults to "*"
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: A boolean flag that determines whether to remove
            fields that are not present in the header. If set to `True`, any field that is not
            in the header will be excluded from the returned list, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: A list of INFO fields to explode, with stripped names and patterns expanded
            against the header fields.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern
                r = re.compile(rf"^{field}$")
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
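    # Pattern-expansion sketch, assuming a header that declares the INFO fields
    # AF, ANN and DP (illustrative):
    #
    #   variants.get_explode_infos_fields("DP,AF")  # -> ['DP', 'AF']
    #   variants.get_explode_infos_fields("A.*")    # -> ['AF', 'ANN'] (regex match)
    #   variants.get_explode_infos_fields("*")      # -> ['AF', 'ANN', 'DP'] (all, sorted)
    #   variants.get_explode_infos_fields("XX")     # -> ['XX'], kept even though absent
    #   # from the header, unless remove_fields_not_in_header=True (then -> [])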
    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the
        `explode_infos_prefix` parameter, or the value found in the parameters
        (`self.get_param().get("explode", {}).get("explode_infos_prefix", "")`) if
        `explode_infos_prefix` is not provided.

        :param explode_infos_prefix: A string prefix to be used for exploded INFO fields
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default
        value if it doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The name of the column that you want to add to the table
        :param column_type: The data type of the column that you want to add, as a string such
            as "INTEGER", "TEXT", "REAL", etc
        :param default_value: An optional default value for the newly added column, assigned
            to any existing rows that do not have a value for that column
        :param drop: A boolean flag that determines whether to drop the column if it already
            exists in the table before adding it again, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column, or None if no column was added.
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column does NOT exist in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        The `drop_column` function drops a specified column from a given table in a database
        and returns True if the column was successfully dropped, and False if the column does
        not exist in the table.

        :param column: A dictionary that contains information about the column you want to
            drop, with two keys: "table_name" and "column_name". It can also be a string, in
            which case it is used as the column name within the variants table
        :type column: dict
        :param table_name: The name of the table from which you want to drop a column
        :type table_name: str
        :param column_name: The name of the column that you want to drop from the table
        :type column_name: str
        :return: a boolean value. It returns True if the column was successfully dropped from
            the table, and False if the column does not exist in the table.
        """

        # Find column infos
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        # Both a table name and a column name are required
        if not table_name or not column_name:
            return False

        # Removed
        removed = False

        # Check if the column already exists in the table
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column does NOT exist in the {table_name} table")
            return False

        # Drop column in table (e.g. ALTER TABLE integers DROP k)
        drop_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(drop_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped from the {table_name} table"
        )

        return removed
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: A string used as a prefix for the exploded INFO fields. If not provided
            or set to `None`, the value of `self.get_explode_infos_prefix()` is used, with
            "INFO/" as fallback
        :type prefix: str
        :param create_index: A boolean flag that specifies whether to create indexes on the
            exploded INFO fields, defaults to False
        :type create_index: bool (optional)
        :param fields: A list of INFO fields to explode into individual columns. If not
            provided, all INFO fields will be exploded
        :type fields: list
        :param force: A boolean flag that determines whether to drop and recreate a column if
            it already exists in the table, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: A boolean flag that determines whether to process
            all the INFO fields together in a single UPDATE, or individually, defaults to
            False
        :type proccess_all_fields_together: bool (optional)
        :param table: The name of the table where the exploded INFO fields will be added as
            individual columns. If not provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            try:
                extra_infos = self.get_extra_infos()
            except Exception:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                        ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                    END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except Exception:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
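    # The DuckDB branch above parses values out of a ';'-prefixed INFO string with
    # REGEXP_EXTRACT. A standalone sketch of that extraction logic (plain duckdb,
    # no Variants object needed; values illustrative):
    #
    #   import duckdb
    #   duckdb.sql(
    #       "SELECT REGEXP_EXTRACT(concat(';', 'DP=12;AF=0.5'), ';DP=([^;]*)', 1)"
    #   ).fetchone()  # -> ('12',)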
    def create_indexes(self) -> None:
        """
        Create indexes on the table after insertion
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ("REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)
    def drop_indexes(self) -> None:
        """
        Drop all indexes on the variants table
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)
    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines

        :param f: the file object
        :return: The header lines of the VCF file.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list
    def read_vcf_header_file(self, file: str = None) -> list:
        """
        The `read_vcf_header_file` function reads the header of a VCF file, handling both
        compressed and uncompressed files.

        :param file: The path to the VCF header file to read, defaults to None
        :type file: str
        :return: The function `read_vcf_header_file` returns a list of header lines.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)
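    # Usage sketch (path hypothetical): read header lines up to and including the
    # #CHROM line, transparently handling bgzip-compressed files.
    #
    #   header_lines = variants.read_vcf_header_file(file="sample.vcf.gz")
    #   header_lines[0]   # '##fileformat=VCFv4.2\n'
    #   header_lines[-1]  # '#CHROM\tPOS\t...'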
    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query, or None if no query is provided.
        """
        if query:
            return self.conn.execute(query)  # .fetchall()
        else:
            return None
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
        fields_to_rename: dict | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to various formats, including
        VCF, CSV, TSV, PSV, and Parquet, with options for customization such as filtering,
        sorting, and partitioning.

        :param output_file: The name of the output file where the exported data will be saved
        :type output_file: str | None
        :param output_header: The name of the file where the header of the VCF file will be
            exported. If not provided, the header is exported to a file with the same name as
            `output_file`, with the extension ".hdr"
        :type output_header: str | None
        :param export_header: A boolean flag that determines whether the header of the VCF
            file should be exported to a separate file, defaults to True
        :type export_header: bool (optional)
        :param query: An optional SQL query used to filter and select specific data from the
            VCF file before exporting it
        :type query: str | None
        :param parquet_partitions: A list of columns to use for partitioning the Parquet file
            during export. Partitioning organizes data in a hierarchical directory structure
            based on the values of one or more columns, which can improve query performance on
            large datasets
        :type parquet_partitions: list | None
        :param chunk_size: The number of records in a batch when exporting data in Parquet
            format, used for partitioning the Parquet file into multiple files
        :type chunk_size: int | None
        :param threads: The number of threads to use during the export process. If not
            provided, the default number of threads is used
        :type threads: int | None
        :param sort: A boolean flag that determines whether the output file should be sorted
            based on genomic coordinates of the variants, defaults to False
        :type sort: bool (optional)
        :param index: A boolean flag that determines whether an index should be created on the
            output file, defaults to False
        :type index: bool (optional)
        :param order_by: The column(s) to use for sorting the output file. Only applicable
            when exporting data in VCF format
        :type order_by: str | None
        :param fields_to_rename: A dictionary mapping original field names (keys) to new field
            names (values), applied before exporting the data
        :type fields_to_rename: dict | None
        :return: True if the output file exists after export, or None if it doesn't.
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Rename fields
        if not fields_to_rename:
            fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
        self.rename_info_fields(fields_to_rename=fields_to_rename)

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # Export in Parquet
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
            """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing columns header
        existing_columns_header = database.get_header_columns_from_database(query=query)

        # Sample list
        if output_file_type in ["vcf"]:
            get_samples = self.get_samples()
            get_samples_check = self.get_samples_check()
            samples_force = get_samples is not None
            sample_list = self.get_header_sample_list(
                check=get_samples_check,
                samples=get_samples,
                samples_force=samples_force,
            )
        else:
            sample_list = None

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove
        remove_if_exists(tmp_to_remove)

        # Return True if the output file exists, else None
        return os.path.exists(output_file) or None
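As a minimal usage sketch of `export_output` (all file paths and the `order_by` value below are hypothetical, and the constructor arguments follow the `Variants.__init__` signature shown at the top of this module):

from howard.objects.variants import Variants

# Hypothetical input VCF; load=True triggers load_data() at construction.
variants = Variants(input="tests/data/example.vcf.gz", load=True)

# Export to Parquet: the target format is derived from the output extension.
variants.export_output(output_file="/tmp/example.parquet")

# Export to TSV with an explicit sidecar header file and ordering.
variants.export_output(
    output_file="/tmp/example.tsv",
    output_header="/tmp/example.tsv.hdr",
    order_by='"#CHROM", "POS"',
)

For a `.vcf` output the sidecar header is switched off (and removed if already produced), since the header is embedded in the VCF itself.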
2305 def get_extra_infos(self, table: str = None) -> list: 2306 """ 2307 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2308 in the header. 2309 2310 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2311 name of the table from which you want to retrieve the extra columns that are not present in the 2312 header. If the `table` parameter is not provided when calling the function, it will default to 2313 using the variants 2314 :type table: str 2315 :return: A list of columns that are in the specified table but not in the header of the table. 2316 """ 2317 2318 header_columns = [] 2319 2320 if not table: 2321 table = self.get_table_variants(clause="from") 2322 header_columns = self.get_header_columns() 2323 2324 # Check all columns in the database 2325 query = f""" SELECT * FROM {table} LIMIT 1 """ 2326 log.debug(f"query {query}") 2327 table_columns = self.get_query_to_df(query).columns.tolist() 2328 extra_columns = [] 2329 2330 # Construct extra infos (not in header) 2331 for column in table_columns: 2332 if column not in header_columns: 2333 extra_columns.append(column) 2334 2335 return extra_columns
2337 def get_extra_infos_sql(self, table: str = None) -> str: 2338 """ 2339 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2340 by double quotes 2341 2342 :param table: The name of the table to get the extra infos from. If None, the default table is 2343 used 2344 :type table: str 2345 :return: A string of the extra infos 2346 """ 2347 2348 return ", ".join( 2349 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2350 )
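A short sketch of these two helpers, reusing the `variants` object from the export sketch above (printed values are illustrative):

# Columns present in the variants table but absent from the VCF header,
# e.g. columns created by explode_infos().
print(variants.get_extra_infos())        # e.g. ['INFO/DP', 'variant_id']

# The same list, quoted and comma-joined, ready to paste into a SELECT clause.
print(variants.get_extra_infos_sql())    # '"INFO/DP", "variant_id"'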
2352 def export_header( 2353 self, 2354 header_name: str = None, 2355 output_file: str = None, 2356 output_file_ext: str = ".hdr", 2357 clean_header: bool = True, 2358 remove_chrom_line: bool = False, 2359 ) -> str: 2360 """ 2361 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2362 specified options, and writes it to a new file. 2363 2364 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2365 this parameter is not specified, the header will be written to the output file 2366 :type header_name: str 2367 :param output_file: The `output_file` parameter in the `export_header` function is used to 2368 specify the name of the output file where the header will be written. If this parameter is not 2369 provided, the header will be written to a temporary file 2370 :type output_file: str 2371 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2372 string that represents the extension of the output header file. By default, it is set to ".hdr" 2373 if not specified by the user. This extension will be appended to the `output_file` name to 2374 create the final, defaults to .hdr 2375 :type output_file_ext: str (optional) 2376 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2377 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2378 `True`, the function will clean the header by modifying certain lines based on a specific 2379 pattern. If `clean_header`, defaults to True 2380 :type clean_header: bool (optional) 2381 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2382 boolean flag that determines whether the #CHROM line should be removed from the header before 2383 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2384 defaults to False 2385 :type remove_chrom_line: bool (optional) 2386 :return: The function `export_header` returns the name of the temporary header file that is 2387 created. 
2388 """ 2389 2390 if not header_name and not output_file: 2391 output_file = self.get_output() 2392 2393 if self.get_header(): 2394 2395 # Get header object 2396 header_obj = self.get_header() 2397 2398 # Create database 2399 db_for_header = Database(database=self.get_input()) 2400 2401 # Get real columns in the file 2402 db_header_columns = db_for_header.get_columns() 2403 2404 with tempfile.TemporaryDirectory() as tmpdir: 2405 2406 # Write header file 2407 header_file_tmp = os.path.join(tmpdir, "header") 2408 f = open(header_file_tmp, "w") 2409 vcf.Writer(f, header_obj) 2410 f.close() 2411 2412 # Replace #CHROM line with rel columns 2413 header_list = db_for_header.read_header_file( 2414 header_file=header_file_tmp 2415 ) 2416 header_list[-1] = "\t".join(db_header_columns) 2417 2418 # Remove CHROM line 2419 if remove_chrom_line: 2420 header_list.pop() 2421 2422 # Clean header 2423 if clean_header: 2424 header_list_clean = [] 2425 for head in header_list: 2426 # Clean head for malformed header 2427 head_clean = head 2428 head_clean = re.subn( 2429 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2430 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2431 head_clean, 2432 2, 2433 )[0] 2434 # Write header 2435 header_list_clean.append(head_clean) 2436 header_list = header_list_clean 2437 2438 tmp_header_name = output_file + output_file_ext 2439 2440 f = open(tmp_header_name, "w") 2441 for line in header_list: 2442 f.write(line) 2443 f.close() 2444 2445 return tmp_header_name
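A usage sketch: write the (cleaned) header next to a hypothetical output file; the returned path is the output file name plus the default `.hdr` extension:

# Writes '/tmp/example.tsv.hdr' and returns that path.
hdr_file = variants.export_header(output_file="/tmp/example.tsv")

# Same, but without the final #CHROM column line, keeping only the
# meta-information lines.
hdr_meta_only = variants.export_header(
    output_file="/tmp/example.tsv",
    remove_chrom_line=True,
)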
2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
2509 info_field = f"""'{remove_info}' as INFO""" 2510 else: 2511 info_field = "INFO" 2512 2513 # Samples fields 2514 if add_samples: 2515 if not list_samples: 2516 list_samples = self.get_header_sample_list() 2517 if list_samples: 2518 samples_fields = " , FORMAT , " + " , ".join( 2519 [f""" "{sample}" """ for sample in list_samples] 2520 ) 2521 else: 2522 samples_fields = "" 2523 log.debug(f"samples_fields: {samples_fields}") 2524 else: 2525 samples_fields = "" 2526 2527 # Where clause 2528 if where_clause is None: 2529 where_clause = "" 2530 2531 # Variants 2532 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2533 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2534 log.debug(f"sql_query_select={sql_query_select}") 2535 2536 return self.export_output( 2537 output_file=vcf_file, 2538 output_header=None, 2539 export_header=True, 2540 query=sql_query_select, 2541 parquet_partitions=None, 2542 chunk_size=config.get("chunk_size", None), 2543 threads=threads, 2544 sort=True, 2545 index=index, 2546 order_by=None, 2547 )
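A sketch that mirrors how `annotation_bigwig` (further below) prepares its temporary input: a sorted, bgzipped, tabix-indexed VCF with the INFO column blanked and sample columns dropped (output path hypothetical):

variants.export_variant_vcf(
    vcf_file="/tmp/input.vcf.gz",
    remove_info=True,      # INFO column is replaced by '.'
    add_samples=False,     # drop FORMAT and per-sample columns
    index=True,            # tabix-index the output
)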
2549 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2550 """ 2551 It takes a list of commands and runs them in parallel using the number of threads specified 2552 2553 :param commands: A list of commands to run 2554 :param threads: The number of threads to use, defaults to 1 (optional) 2555 """ 2556 2557 run_parallel_commands(commands, threads)
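`run_commands` simply forwards to `run_parallel_commands`, which comes in through the star imports at the top of the module. A trivial sketch, reusing the `variants` object from the export sketch:

# Run two independent shell commands on two worker threads.
variants.run_commands(
    commands=["echo 'first command'", "echo 'second command'"],
    threads=2,
)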
2559 def get_threads(self, default: int = 1) -> int: 2560 """ 2561 This function returns the number of threads to use for a job, with a default value of 1 if not 2562 specified. 2563 2564 :param default: The `default` parameter in the `get_threads` method is used to specify the 2565 default number of threads to use if no specific value is provided. If no value is provided for 2566 the `threads` parameter in the configuration or input parameters, the `default` value will be 2567 used, defaults to 1 2568 :type default: int (optional) 2569 :return: the number of threads to use for the current job. 2570 """ 2571 2572 # Config 2573 config = self.get_config() 2574 2575 # Param 2576 param = self.get_param() 2577 2578 # Input threads 2579 input_thread = param.get("threads", config.get("threads", None)) 2580 2581 # Check threads 2582 if not input_thread: 2583 threads = default 2584 elif int(input_thread) <= 0: 2585 threads = os.cpu_count() 2586 else: 2587 threads = int(input_thread) 2588 return threads
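A sketch of the resolution order implemented above, reusing the `variants` object from the export sketch: an explicit `param` value wins over `config`, zero or negative means "use all cores", and the `default` applies when nothing is set:

import os

variants.set_param({"threads": 4})
print(variants.get_threads())                      # 4 (explicit value)

variants.set_param({"threads": -1})
print(variants.get_threads() == os.cpu_count())    # True (<= 0 selects all cores)

variants.set_param({})
print(variants.get_threads(default=2))             # 2 (fallback default)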
2590 def get_memory(self, default: str = None) -> str:
2591     """
2592     This function retrieves the memory value from parameters or configuration, with a default
2593     value if not found.
2594
2595     :param default: The default memory value, as a string. It is used as a fallback in case the
2596     `memory` parameter is not provided in the `param` dictionary or the `config` dictionary; if
2597     `memory` is not found in either dictionary, the function returns this value
2598     :type default: str
2599     :return: A string representing the memory parameter. If `input_memory` is found in the
2600     parameters or configuration, it is returned. Otherwise, the default value provided as an
2601     argument is returned.
2602     """
2603
2604     # Config
2605     config = self.get_config()
2606
2607     # Param
2608     param = self.get_param()
2609
2610     # Input memory
2611     input_memory = param.get("memory", config.get("memory", None))
2612
2613     # Check memory
2614     if input_memory:
2615         memory = input_memory
2616     else:
2617         memory = default
2618
2619     return memory
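The same param-over-config precedence applies here; a short sketch, again reusing the `variants` object:

variants.set_param({"memory": "8G"})
print(variants.get_memory())                # '8G' (from param)

variants.set_param({})
print(variants.get_memory(default="4G"))    # '4G' (fallback default)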
2622 def update_from_vcf(self, vcf_file: str) -> None:
2623     """
2624     If the connexion format is duckdb, use the DuckDB update method; if it is sqlite, use the
2625     SQLite update method
2626
2627     :param vcf_file: the path to the VCF file
2628     """
2629
2630     connexion_format = self.get_connexion_format()
2631
2632     if connexion_format in ["duckdb"]:
2633         self.update_from_vcf_duckdb(vcf_file)
2634     elif connexion_format in ["sqlite"]:
2635         self.update_from_vcf_sqlite(vcf_file)
2636 def update_from_vcf_duckdb(self, vcf_file: str) -> None:
2637     """
2638     It takes a VCF file and updates the INFO column of the variants table in the database with the
2639     INFO column of the VCF file
2640
2641     :param vcf_file: the path to the VCF file
2642     """
2643
2644     # Variants table
2645     table_variants = self.get_table_variants()
2646
2647     # Load VCF into temporary table
2648     skip = self.get_header_length(file=vcf_file)
2649     vcf_df = pd.read_csv(
2650         vcf_file,
2651         sep="\t",
2652         engine="c",
2653         skiprows=skip,
2654         header=0,
2655         low_memory=False,
2656     )
2657     sql_query_update = f"""
2658         UPDATE {table_variants} as table_variants
2659         SET INFO = concat(
2660             CASE
2661                 WHEN INFO NOT IN ('', '.')
2662                 THEN INFO
2663                 ELSE ''
2664             END,
2665             (
2666                 SELECT
2667                     concat(
2668                         CASE
2669                             WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
2670                             THEN ';'
2671                             ELSE ''
2672                         END
2673                         ,
2674                         CASE
2675                             WHEN table_parquet.INFO NOT IN ('','.')
2676                             THEN table_parquet.INFO
2677                             ELSE ''
2678                         END
2679                     )
2680                 FROM vcf_df as table_parquet
2681                 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
2682                 AND table_parquet.\"POS\" = table_variants.\"POS\"
2683                 AND table_parquet.\"ALT\" = table_variants.\"ALT\"
2684                 AND table_parquet.\"REF\" = table_variants.\"REF\"
2685                 AND table_parquet.INFO NOT IN ('','.')
2686             )
2687         )
2688         ;
2689     """
2690     self.conn.execute(sql_query_update)
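The UPDATE above appends the incoming INFO string to the existing one, inserting a ';' separator only when both sides are non-empty (treating '' and '.' as empty). A self-contained DuckDB sketch of that merge semantics on toy data; DuckDB resolves the local `variants_df` and `vcf_df` DataFrames by name via its replacement scans:

import duckdb
import pandas as pd

# Existing variants: one with an INFO value, one empty ('.').
variants_df = pd.DataFrame({
    "#CHROM": ["chr1", "chr1"], "POS": [100, 200],
    "REF": ["A", "C"], "ALT": ["T", "G"],
    "INFO": ["DP=10", "."],
})

# Incoming annotations, e.g. parsed from an annotated VCF.
vcf_df = pd.DataFrame({
    "#CHROM": ["chr1", "chr1"], "POS": [100, 200],
    "REF": ["A", "C"], "ALT": ["T", "G"],
    "INFO": ["AF=0.5", "AF=0.1"],
})

conn = duckdb.connect()
conn.execute("CREATE TABLE variants AS SELECT * FROM variants_df")
conn.execute("""
    UPDATE variants AS table_variants
    SET INFO = concat(
        CASE WHEN INFO NOT IN ('', '.') THEN INFO ELSE '' END,
        (
            SELECT concat(
                CASE WHEN table_variants.INFO NOT IN ('', '.')
                      AND table_parquet.INFO NOT IN ('', '.')
                     THEN ';' ELSE '' END,
                CASE WHEN table_parquet.INFO NOT IN ('', '.')
                     THEN table_parquet.INFO ELSE '' END
            )
            FROM vcf_df AS table_parquet
            WHERE table_parquet."#CHROM" = table_variants."#CHROM"
              AND table_parquet."POS" = table_variants."POS"
              AND table_parquet."REF" = table_variants."REF"
              AND table_parquet."ALT" = table_variants."ALT"
        )
    )
""")
print(conn.execute("SELECT INFO FROM variants ORDER BY POS").fetchall())
# [('DP=10;AF=0.5',), ('AF=0.1',)]

`update_from_vcf_sqlite` below implements the same merge with SQLite's || concatenation operator and a real temporary table, since SQLite cannot scan a DataFrame by name.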
2692 def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2693     """
2694     It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2695     table, then updates the INFO column of the variants table with the INFO column of the temporary
2696     table
2697
2698     :param vcf_file: The path to the VCF file you want to update the database with
2699     """
2700
2701     # Create a temporary table for the VCF
2702     table_vcf = "tmp_vcf"
2703     sql_create = (
2704         f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2705     )
2706     self.conn.execute(sql_create)
2707
2708     # Load VCF into temporary table
2709     vcf_df = pd.read_csv(
2710         vcf_file, sep="\t", comment="#", header=None, low_memory=False
2711     )
2712     vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2713     vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2714
2715     # Update table 'variants' with VCF data
2716     # warning: CONCAT as || operator
2717     sql_query_update = f"""
2718         UPDATE variants as table_variants
2719         SET INFO = CASE
2720                 WHEN INFO NOT IN ('', '.')
2721                 THEN INFO
2722                 ELSE ''
2723             END ||
2724             (
2725                 SELECT
2726                     CASE
2727                         WHEN table_variants.INFO NOT IN ('','.')
2728                         AND table_vcf.INFO NOT IN ('','.')
2729                         THEN ';'
2730                         ELSE ''
2731                     END ||
2732                     CASE
2733                         WHEN table_vcf.INFO NOT IN ('','.')
2734                         THEN table_vcf.INFO
2735                         ELSE ''
2736                     END
2737                 FROM {table_vcf} as table_vcf
2738                 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2739                 AND table_vcf.\"POS\" = table_variants.\"POS\"
2740                 AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2741                 AND table_vcf.\"REF\" = table_variants.\"REF\"
2742             )
2743     """
2744     self.conn.execute(sql_query_update)
2745
2746     # Drop temporary table
2747     sql_drop = f"DROP TABLE {table_vcf}"
2748     self.conn.execute(sql_drop)
2750 def drop_variants_table(self) -> None: 2751 """ 2752 > This function drops the variants table 2753 """ 2754 2755 table_variants = self.get_table_variants() 2756 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2757 self.conn.execute(sql_table_variants)
2759 def set_variant_id(
2760     self, variant_id_column: str = "variant_id", force: bool = None
2761 ) -> str:
2762     """
2763     It adds a column to the variants table called `variant_id` and populates it with a hash of
2764     the assembly and the `#CHROM`, `POS`, `REF`, and `ALT` columns
2765
2766     :param variant_id_column: The name of the column to be created in the variants table, defaults
2767     to variant_id
2768     :type variant_id_column: str (optional)
2769     :param force: If True, the variant_id column will be created even if it already exists
2770     :type force: bool
2771     :return: The name of the column that contains the variant_id
2772     """
2773
2774     # Assembly
2775     assembly = self.get_param().get(
2776         "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2777     )
2778
2779     # INFO/Tag prefix
2780     prefix = self.get_explode_infos_prefix()
2781
2782     # Explode INFO/SVTYPE
2783     added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2784
2785     # variants table
2786     table_variants = self.get_table_variants()
2787
2788     # variant_id column
2789     if not variant_id_column:
2790         variant_id_column = "variant_id"
2791
2792     # Create variant_id column
2793     if "variant_id" not in self.get_extra_infos() or force:
2794
2795         # Create column
2796         self.add_column(
2797             table_name=table_variants,
2798             column_name=variant_id_column,
2799             column_type="UBIGINT",
2800             default_value="0",
2801         )
2802
2803         # Update column
2804         self.conn.execute(
2805             f"""
2806             UPDATE {table_variants}
2807             SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2808             """
2809         )
2810
2811     # Remove added columns
2812     for added_column in added_columns:
2813         self.drop_column(column=added_column)
2814
2815     # return variant_id column name
2816     return variant_id_column
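The identifier itself comes from DuckDB's built-in variadic `hash()` function, as used in the UPDATE above. A standalone sketch of the expression (assembly, locus, and SVTYPE values are illustrative):

import duckdb

# One UBIGINT per (assembly, chrom, pos, ref, alt, svtype) tuple,
# deterministic for a given DuckDB version.
row = duckdb.connect().execute(
    "SELECT hash('hg19', 'chr1', 12345, 'A', 'T', 'DEL') AS variant_id"
).fetchone()
print(row[0])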
2818 def get_variant_id_column( 2819 self, variant_id_column: str = "variant_id", force: bool = None 2820 ) -> str: 2821 """ 2822 This function returns the variant_id column name 2823 2824 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2825 defaults to variant_id 2826 :type variant_id_column: str (optional) 2827 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2828 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2829 if it is not already set, or if it is set 2830 :type force: bool 2831 :return: The variant_id column name. 2832 """ 2833 2834 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2840 def scan_databases(
2841     self,
2842     database_formats: list = ["parquet"],
2843     database_releases: list = ["current"],
2844 ) -> dict:
2845     """
2846     The function `scan_databases` scans for available databases based on specified formats and
2847     releases.
2848
2849     :param database_formats: The `database_formats` parameter is a list that specifies the formats
2850     of the databases to be scanned, for example "parquet"
2851     :type database_formats: list
2852     :param database_releases: The `database_releases` parameter is a list that specifies the
2853     releases of the databases to be scanned. The default value is `["current"]`, meaning that by
2854     default, the function will scan databases in the "current" release
2855     :type database_releases: list
2856     :return: The function `scan_databases` returns a dictionary containing information about
2857     databases that match the specified formats and releases.
2858     """
2859
2860     # Config
2861     config = self.get_config()
2862
2863     # Param
2864     param = self.get_param()
2865
2866     # Param - Assembly
2867     assembly = param.get("assembly", config.get("assembly", None))
2868     if not assembly:
2869         assembly = DEFAULT_ASSEMBLY
2870         log.warning(f"Default assembly '{assembly}'")
2871
2872     # Scan for available databases
2873     log.info(
2874         f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2875     )
2876     databases_infos_dict = databases_infos(
2877         database_folder_releases=database_releases,
2878         database_formats=database_formats,
2879         assembly=assembly,
2880         config=config,
2881     )
2882     log.info(
2883         f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2884     )
2885
2886     return databases_infos_dict
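A usage sketch, reusing the `variants` object from the export sketch; per the docstring the result is a dictionary describing each found database, so the keys shown are illustrative:

found = variants.scan_databases(
    database_formats=["parquet"],
    database_releases=["current"],
)
for database in found:
    print(database)   # e.g. a path under ~/howard/databases/parquet/current/hg19/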
2889 def annotation(self) -> None: 2890 """ 2891 It annotates the VCF file with the annotations specified in the config file. 2892 """ 2893 2894 # Config 2895 config = self.get_config() 2896 2897 # Param 2898 param = self.get_param() 2899 2900 # Param - Assembly 2901 assembly = param.get("assembly", config.get("assembly", None)) 2902 if not assembly: 2903 assembly = DEFAULT_ASSEMBLY 2904 log.warning(f"Default assembly '{assembly}'") 2905 2906 # annotations databases folders 2907 annotations_databases = set( 2908 config.get("folders", {}) 2909 .get("databases", {}) 2910 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2911 + config.get("folders", {}) 2912 .get("databases", {}) 2913 .get("parquet", ["~/howard/databases/parquet/current"]) 2914 + config.get("folders", {}) 2915 .get("databases", {}) 2916 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2917 ) 2918 2919 # Get param annotations 2920 if param.get("annotations", None) and isinstance( 2921 param.get("annotations", None), str 2922 ): 2923 log.debug(param.get("annotations", None)) 2924 param_annotation_list = param.get("annotations").split(",") 2925 else: 2926 param_annotation_list = [] 2927 2928 # Each tools param 2929 if param.get("annotation_parquet", None) != None: 2930 log.debug( 2931 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2932 ) 2933 if isinstance(param.get("annotation_parquet", None), list): 2934 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2935 else: 2936 param_annotation_list.append(param.get("annotation_parquet")) 2937 if param.get("annotation_snpsift", None) != None: 2938 if isinstance(param.get("annotation_snpsift", None), list): 2939 param_annotation_list.append( 2940 "snpsift:" 2941 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2942 ) 2943 else: 2944 param_annotation_list.append( 2945 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2946 ) 2947 if param.get("annotation_snpeff", None) != None: 2948 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2949 if param.get("annotation_bcftools", None) != None: 2950 if isinstance(param.get("annotation_bcftools", None), list): 2951 param_annotation_list.append( 2952 "bcftools:" 2953 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2954 ) 2955 else: 2956 param_annotation_list.append( 2957 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2958 ) 2959 if param.get("annotation_annovar", None) != None: 2960 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2961 if param.get("annotation_exomiser", None) != None: 2962 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2963 if param.get("annotation_splice", None) != None: 2964 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2965 2966 # Merge param annotations list 2967 param["annotations"] = ",".join(param_annotation_list) 2968 2969 # debug 2970 log.debug(f"param_annotations={param['annotations']}") 2971 2972 if param.get("annotations"): 2973 2974 # Log 2975 # log.info("Annotations - Check annotation parameters") 2976 2977 if not "annotation" in param: 2978 param["annotation"] = {} 2979 2980 # List of annotations parameters 2981 annotations_list_input = {} 2982 if isinstance(param.get("annotations", None), str): 2983 annotation_file_list = [ 2984 value for value in param.get("annotations", "").split(",") 2985 ] 2986 for annotation_file in annotation_file_list: 2987 
annotations_list_input[annotation_file.strip()] = {"INFO": None} 2988 else: 2989 annotations_list_input = param.get("annotations", {}) 2990 2991 log.info(f"Quick Annotations:") 2992 for annotation_key in list(annotations_list_input.keys()): 2993 log.info(f" {annotation_key}") 2994 2995 # List of annotations and associated fields 2996 annotations_list = {} 2997 2998 for annotation_file in annotations_list_input: 2999 3000 # Explode annotations if ALL 3001 if ( 3002 annotation_file.upper() == "ALL" 3003 or annotation_file.upper().startswith("ALL:") 3004 ): 3005 3006 # check ALL parameters (formats, releases) 3007 annotation_file_split = annotation_file.split(":") 3008 database_formats = "parquet" 3009 database_releases = "current" 3010 for annotation_file_option in annotation_file_split[1:]: 3011 database_all_options_split = annotation_file_option.split("=") 3012 if database_all_options_split[0] == "format": 3013 database_formats = database_all_options_split[1].split("+") 3014 if database_all_options_split[0] == "release": 3015 database_releases = database_all_options_split[1].split("+") 3016 3017 # Scan for availabled databases 3018 databases_infos_dict = self.scan_databases( 3019 database_formats=database_formats, 3020 database_releases=database_releases, 3021 ) 3022 3023 # Add found databases in annotation parameters 3024 for database_infos in databases_infos_dict.keys(): 3025 annotations_list[database_infos] = {"INFO": None} 3026 3027 else: 3028 annotations_list[annotation_file] = annotations_list_input[ 3029 annotation_file 3030 ] 3031 3032 # Check each databases 3033 if len(annotations_list): 3034 3035 log.info( 3036 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 3037 ) 3038 3039 for annotation_file in annotations_list: 3040 3041 # Init 3042 annotations = annotations_list.get(annotation_file, None) 3043 3044 # Annotation snpEff 3045 if annotation_file.startswith("snpeff"): 3046 3047 log.debug(f"Quick Annotation snpEff") 3048 3049 if "snpeff" not in param["annotation"]: 3050 param["annotation"]["snpeff"] = {} 3051 3052 if "options" not in param["annotation"]["snpeff"]: 3053 param["annotation"]["snpeff"]["options"] = "" 3054 3055 # snpEff options in annotations 3056 param["annotation"]["snpeff"]["options"] = "".join( 3057 annotation_file.split(":")[1:] 3058 ) 3059 3060 # Annotation Annovar 3061 elif annotation_file.startswith("annovar"): 3062 3063 log.debug(f"Quick Annotation Annovar") 3064 3065 if "annovar" not in param["annotation"]: 3066 param["annotation"]["annovar"] = {} 3067 3068 if "annotations" not in param["annotation"]["annovar"]: 3069 param["annotation"]["annovar"]["annotations"] = {} 3070 3071 # Options 3072 annotation_file_split = annotation_file.split(":") 3073 for annotation_file_annotation in annotation_file_split[1:]: 3074 if annotation_file_annotation: 3075 param["annotation"]["annovar"]["annotations"][ 3076 annotation_file_annotation 3077 ] = annotations 3078 3079 # Annotation Exomiser 3080 elif annotation_file.startswith("exomiser"): 3081 3082 log.debug(f"Quick Annotation Exomiser") 3083 3084 param["annotation"]["exomiser"] = params_string_to_dict( 3085 annotation_file 3086 ) 3087 3088 # Annotation Splice 3089 elif annotation_file.startswith("splice"): 3090 3091 log.debug(f"Quick Annotation Splice") 3092 3093 param["annotation"]["splice"] = params_string_to_dict( 3094 annotation_file 3095 ) 3096 3097 # Annotation Parquet or BCFTOOLS 3098 else: 3099 3100 # Tools detection 3101 if annotation_file.startswith("bcftools:"): 3102 
annotation_tool_initial = "bcftools" 3103 annotation_file = ":".join(annotation_file.split(":")[1:]) 3104 elif annotation_file.startswith("snpsift:"): 3105 annotation_tool_initial = "snpsift" 3106 annotation_file = ":".join(annotation_file.split(":")[1:]) 3107 elif annotation_file.startswith("bigwig:"): 3108 annotation_tool_initial = "bigwig" 3109 annotation_file = ":".join(annotation_file.split(":")[1:]) 3110 else: 3111 annotation_tool_initial = None 3112 3113 # list of files 3114 annotation_file_list = annotation_file.replace("+", ":").split( 3115 ":" 3116 ) 3117 3118 for annotation_file in annotation_file_list: 3119 3120 if annotation_file: 3121 3122 # Annotation tool initial 3123 annotation_tool = annotation_tool_initial 3124 3125 # Find file 3126 annotation_file_found = None 3127 3128 if os.path.exists(annotation_file): 3129 annotation_file_found = annotation_file 3130 elif os.path.exists(full_path(annotation_file)): 3131 annotation_file_found = full_path(annotation_file) 3132 else: 3133 # Find within assembly folders 3134 for annotations_database in annotations_databases: 3135 found_files = find_all( 3136 annotation_file, 3137 os.path.join( 3138 annotations_database, assembly 3139 ), 3140 ) 3141 if len(found_files) > 0: 3142 annotation_file_found = found_files[0] 3143 break 3144 if not annotation_file_found and not assembly: 3145 # Find within folders 3146 for ( 3147 annotations_database 3148 ) in annotations_databases: 3149 found_files = find_all( 3150 annotation_file, annotations_database 3151 ) 3152 if len(found_files) > 0: 3153 annotation_file_found = found_files[0] 3154 break 3155 log.debug( 3156 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3157 ) 3158 3159 # Full path 3160 annotation_file_found = full_path(annotation_file_found) 3161 3162 if annotation_file_found: 3163 3164 database = Database(database=annotation_file_found) 3165 quick_annotation_format = database.get_format() 3166 quick_annotation_is_compressed = ( 3167 database.is_compressed() 3168 ) 3169 quick_annotation_is_indexed = os.path.exists( 3170 f"{annotation_file_found}.tbi" 3171 ) 3172 bcftools_preference = False 3173 3174 # Check Annotation Tool 3175 if not annotation_tool: 3176 if ( 3177 bcftools_preference 3178 and quick_annotation_format 3179 in ["vcf", "bed"] 3180 and quick_annotation_is_compressed 3181 and quick_annotation_is_indexed 3182 ): 3183 annotation_tool = "bcftools" 3184 elif quick_annotation_format in [ 3185 "vcf", 3186 "bed", 3187 "tsv", 3188 "tsv", 3189 "csv", 3190 "json", 3191 "tbl", 3192 "parquet", 3193 "duckdb", 3194 ]: 3195 annotation_tool = "parquet" 3196 elif quick_annotation_format in ["bw"]: 3197 annotation_tool = "bigwig" 3198 else: 3199 log.error( 3200 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3201 ) 3202 raise ValueError( 3203 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3204 ) 3205 3206 log.debug( 3207 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3208 ) 3209 3210 # Annotation Tool dispatch 3211 if annotation_tool: 3212 if annotation_tool not in param["annotation"]: 3213 param["annotation"][annotation_tool] = {} 3214 if ( 3215 "annotations" 3216 not in param["annotation"][annotation_tool] 3217 ): 3218 param["annotation"][annotation_tool][ 3219 "annotations" 3220 ] = {} 3221 param["annotation"][annotation_tool][ 3222 "annotations" 3223 ][annotation_file_found] = annotations 3224 3225 else: 3226 log.warning( 3227 
f"Quick Annotation File {annotation_file} does NOT exist" 3228 ) 3229 3230 self.set_param(param) 3231 3232 if param.get("annotation", None): 3233 log.info("Annotations") 3234 if param.get("annotation", {}).get("parquet", None): 3235 log.info("Annotations 'parquet'...") 3236 self.annotation_parquet() 3237 if param.get("annotation", {}).get("bcftools", None): 3238 log.info("Annotations 'bcftools'...") 3239 self.annotation_bcftools() 3240 if param.get("annotation", {}).get("snpsift", None): 3241 log.info("Annotations 'snpsift'...") 3242 self.annotation_snpsift() 3243 if param.get("annotation", {}).get("bigwig", None): 3244 log.info("Annotations 'bigwig'...") 3245 self.annotation_bigwig() 3246 if param.get("annotation", {}).get("annovar", None): 3247 log.info("Annotations 'annovar'...") 3248 self.annotation_annovar() 3249 if param.get("annotation", {}).get("snpeff", None): 3250 log.info("Annotations 'snpeff'...") 3251 self.annotation_snpeff() 3252 if param.get("annotation", {}).get("exomiser", None) is not None: 3253 log.info("Annotations 'exomiser'...") 3254 self.annotation_exomiser() 3255 if param.get("annotation", {}).get("splice", None) is not None: 3256 log.info("Annotations 'splice' ...") 3257 self.annotation_splice() 3258 3259 # Explode INFOS fields into table fields 3260 if self.get_explode_infos(): 3261 self.explode_infos( 3262 prefix=self.get_explode_infos_prefix(), 3263 fields=self.get_explode_infos_fields(), 3264 force=True, 3265 )
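The parsing above accepts one comma-separated `annotations` string in which each entry is either a database file or a `tool:`-prefixed directive. A sketch of a quick-annotation parameter set (all database names are hypothetical):

param = {
    "assembly": "hg19",
    "annotations": ",".join([
        "clinvar.parquet",                     # bare file: tool chosen from its format
        "bcftools:cosmic.vcf.gz",              # force the bcftools annotator
        "snpeff:-hgvs",                        # snpEff with extra options
        "ALL:format=parquet:release=current",  # scan and add every matching database
    ]),
}
variants.set_param(param)
variants.annotation()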
3267 def annotation_bigwig(self, threads: int = None) -> None: 3268 """ 3269 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3270 3271 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3272 number of threads to be used for parallel processing during the annotation process. If the 3273 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3274 threads to use based on the system configuration 3275 :type threads: int 3276 :return: True 3277 """ 3278 3279 # DEBUG 3280 log.debug("Start annotation with bigwig databases") 3281 3282 # # Threads 3283 # if not threads: 3284 # threads = self.get_threads() 3285 # log.debug("Threads: " + str(threads)) 3286 3287 # Config 3288 config = self.get_config() 3289 log.debug("Config: " + str(config)) 3290 3291 # Config - BCFTools databases folders 3292 databases_folders = set( 3293 self.get_config() 3294 .get("folders", {}) 3295 .get("databases", {}) 3296 .get("annotations", ["."]) 3297 + self.get_config() 3298 .get("folders", {}) 3299 .get("databases", {}) 3300 .get("bigwig", ["."]) 3301 ) 3302 log.debug("Databases annotations: " + str(databases_folders)) 3303 3304 # Param 3305 annotations = ( 3306 self.get_param() 3307 .get("annotation", {}) 3308 .get("bigwig", {}) 3309 .get("annotations", None) 3310 ) 3311 log.debug("Annotations: " + str(annotations)) 3312 3313 # Assembly 3314 assembly = self.get_param().get( 3315 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3316 ) 3317 3318 # Data 3319 table_variants = self.get_table_variants() 3320 3321 # Check if not empty 3322 log.debug("Check if not empty") 3323 sql_query_chromosomes = ( 3324 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3325 ) 3326 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3327 if not sql_query_chromosomes_df["count"][0]: 3328 log.info(f"VCF empty") 3329 return 3330 3331 # VCF header 3332 vcf_reader = self.get_header() 3333 log.debug("Initial header: " + str(vcf_reader.infos)) 3334 3335 # Existing annotations 3336 for vcf_annotation in self.get_header().infos: 3337 3338 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3339 log.debug( 3340 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3341 ) 3342 3343 if annotations: 3344 3345 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3346 3347 # Export VCF file 3348 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3349 3350 # annotation_bigwig_config 3351 annotation_bigwig_config_list = [] 3352 3353 for annotation in annotations: 3354 annotation_fields = annotations[annotation] 3355 3356 # Annotation Name 3357 annotation_name = os.path.basename(annotation) 3358 3359 if not annotation_fields: 3360 annotation_fields = {"INFO": None} 3361 3362 log.debug(f"Annotation '{annotation_name}'") 3363 log.debug( 3364 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3365 ) 3366 3367 # Create Database 3368 database = Database( 3369 database=annotation, 3370 databases_folders=databases_folders, 3371 assembly=assembly, 3372 ) 3373 3374 # Find files 3375 db_file = database.get_database() 3376 db_file = full_path(db_file) 3377 db_hdr_file = database.get_header_file() 3378 db_hdr_file = full_path(db_hdr_file) 3379 db_file_type = database.get_format() 3380 3381 # If db_file is http ? 
3382 if database.get_database().startswith("http"): 3383 3384 # Datbase is HTTP URL 3385 db_file_is_http = True 3386 3387 # DB file keep as URL 3388 db_file = database.get_database() 3389 log.warning( 3390 f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)" 3391 ) 3392 3393 # Retrieve automatic annotation field name 3394 annotation_field = clean_annotation_field( 3395 os.path.basename(db_file).replace(".bw", "") 3396 ) 3397 log.debug( 3398 f"Create header file with annotation field '{annotation_field}' is an HTTP URL" 3399 ) 3400 3401 # Create automatic header file 3402 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3403 with open(db_hdr_file, "w") as f: 3404 f.write("##fileformat=VCFv4.2\n") 3405 f.write( 3406 f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""" 3407 ) 3408 f.write(f"#CHROM START END {annotation_field}\n") 3409 3410 else: 3411 3412 # Datbase is NOT HTTP URL 3413 db_file_is_http = False 3414 3415 # Check index - try to create if not exists 3416 if ( 3417 db_file is None 3418 or db_hdr_file is None 3419 or (not os.path.exists(db_file) and not db_file_is_http) 3420 or not os.path.exists(db_hdr_file) 3421 or not db_file_type in ["bw"] 3422 ): 3423 # if False: 3424 log.error("Annotation failed: database not valid") 3425 log.error(f"Annotation annotation file: {db_file}") 3426 log.error(f"Annotation annotation file type: {db_file_type}") 3427 log.error(f"Annotation annotation header: {db_hdr_file}") 3428 raise ValueError( 3429 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3430 ) 3431 else: 3432 3433 # Log 3434 log.debug( 3435 f"Annotation '{annotation}' - file: " 3436 + str(db_file) 3437 + " and " 3438 + str(db_hdr_file) 3439 ) 3440 3441 # Load header as VCF object 3442 db_hdr_vcf = Variants(input=db_hdr_file) 3443 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3444 log.debug( 3445 "Annotation database header: " 3446 + str(db_hdr_vcf_header_infos) 3447 ) 3448 3449 # For all fields in database 3450 annotation_fields_full = False 3451 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3452 annotation_fields = { 3453 key: key for key in db_hdr_vcf_header_infos 3454 } 3455 log.debug( 3456 "Annotation database header - All annotations added: " 3457 + str(annotation_fields) 3458 ) 3459 annotation_fields_full = True 3460 3461 # Init 3462 cyvcf2_header_rename_dict = {} 3463 cyvcf2_header_list = [] 3464 cyvcf2_header_indexes = {} 3465 3466 # process annotation fields 3467 for annotation_field in annotation_fields: 3468 3469 # New annotation name 3470 annotation_field_new = annotation_fields[annotation_field] 3471 3472 # Check annotation field and index in header 3473 if ( 3474 annotation_field 3475 in db_hdr_vcf.get_header_columns_as_list() 3476 ): 3477 annotation_field_index = ( 3478 db_hdr_vcf.get_header_columns_as_list().index( 3479 annotation_field 3480 ) 3481 - 3 3482 ) 3483 cyvcf2_header_indexes[annotation_field_new] = ( 3484 annotation_field_index 3485 ) 3486 else: 3487 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3488 log.error(msg_err) 3489 raise ValueError(msg_err) 3490 3491 # Append annotation field in cyvcf2 header list 3492 cyvcf2_header_rename_dict[annotation_field_new] = ( 3493 db_hdr_vcf_header_infos[annotation_field].id 3494 ) 3495 cyvcf2_header_list.append( 3496 { 3497 "ID": annotation_field_new, 3498 "Number": 
db_hdr_vcf_header_infos[ 3499 annotation_field 3500 ].num, 3501 "Type": db_hdr_vcf_header_infos[ 3502 annotation_field 3503 ].type, 3504 "Description": db_hdr_vcf_header_infos[ 3505 annotation_field 3506 ].desc, 3507 } 3508 ) 3509 3510 # Add header on VCF 3511 vcf_reader.infos[annotation_field_new] = vcf.parser._Info( 3512 annotation_field_new, 3513 db_hdr_vcf_header_infos[annotation_field].num, 3514 db_hdr_vcf_header_infos[annotation_field].type, 3515 db_hdr_vcf_header_infos[annotation_field].desc, 3516 "HOWARD BigWig annotation", 3517 "unknown", 3518 self.code_type_map[ 3519 db_hdr_vcf_header_infos[annotation_field].type 3520 ], 3521 ) 3522 3523 # Load bigwig database 3524 bw_db = pyBigWig.open(db_file) 3525 if bw_db.isBigWig(): 3526 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3527 else: 3528 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3529 log.error(msg_err) 3530 raise ValueError(msg_err) 3531 3532 annotation_bigwig_config_list.append( 3533 { 3534 "db_file": db_file, 3535 "bw_db": bw_db, 3536 "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3537 "cyvcf2_header_list": cyvcf2_header_list, 3538 "cyvcf2_header_indexes": cyvcf2_header_indexes, 3539 } 3540 ) 3541 3542 # Annotate 3543 if annotation_bigwig_config_list: 3544 3545 # Annotation config 3546 log.debug( 3547 f"annotation_bigwig_config={annotation_bigwig_config_list}" 3548 ) 3549 3550 # Export VCF file 3551 self.export_variant_vcf( 3552 vcf_file=tmp_vcf_name, 3553 remove_info=True, 3554 add_samples=False, 3555 index=True, 3556 ) 3557 3558 # Load input tmp file 3559 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3560 3561 # Add header in input file 3562 for annotation_bigwig_config in annotation_bigwig_config_list: 3563 for cyvcf2_header_field in annotation_bigwig_config.get( 3564 "cyvcf2_header_list", [] 3565 ): 3566 log.info( 3567 f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'" 3568 ) 3569 input_vcf.add_info_to_header(cyvcf2_header_field) 3570 3571 # Create output VCF file 3572 output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz") 3573 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3574 3575 # Fetch variants 3576 log.info(f"Annotations 'bigwig' start...") 3577 for variant in input_vcf: 3578 3579 for annotation_bigwig_config in annotation_bigwig_config_list: 3580 3581 # DB and indexes 3582 bw_db = annotation_bigwig_config.get("bw_db", None) 3583 cyvcf2_header_indexes = annotation_bigwig_config.get( 3584 "cyvcf2_header_indexes", None 3585 ) 3586 3587 # Retrieve value from chrom pos 3588 res = bw_db.values( 3589 variant.CHROM, variant.POS - 1, variant.POS 3590 ) 3591 3592 # For each annotation fields (and indexes) 3593 for cyvcf2_header_index in cyvcf2_header_indexes: 3594 3595 # If value is NOT nNone 3596 if not np.isnan( 3597 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3598 ): 3599 variant.INFO[cyvcf2_header_index] = res[ 3600 cyvcf2_header_indexes[cyvcf2_header_index] 3601 ] 3602 3603 # Add record in output file 3604 output_vcf.write_record(variant) 3605 3606 # Log 3607 log.debug(f"Annotation done.") 3608 3609 # Close and write file 3610 log.info(f"Annotations 'bigwig' write...") 3611 output_vcf.close() 3612 log.debug(f"Write done.") 3613 3614 # Update variants 3615 log.info(f"Annotations 'bigwig' update...") 3616 self.update_from_vcf(output_vcf_file) 3617 log.debug(f"Update done.") 
3618 3619 return True
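The per-variant lookup above relies on pyBigWig's `values()` accessor. A standalone sketch, assuming a BigWig file at a hypothetical local path; note the 0-based, half-open window derived from the 1-based VCF position:

import math
import pyBigWig

bw = pyBigWig.open("tests/databases/score.bw")  # hypothetical BigWig file
assert bw.isBigWig()

pos = 12345                                     # 1-based VCF position
values = bw.values("chr1", pos - 1, pos)        # single-base window
if not math.isnan(values[0]):                   # NaN means no signal at this base
    print(f"chr1:{pos} -> {values[0]}")
bw.close()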

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        This function annotates with snpSift (results are post-processed with bcftools).

        :param threads: Number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with snpSift databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - snpSift
        snpsift_bin_command = get_bin_command(
            bin="SnpSift.jar",
            tool="snpsift",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpsift_bin_command:
            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - bcftools
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("snpsift", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info("VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # Init
                commands = {}

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()
                    db_tbi_file = f"{db_file}.tbi"
                    db_file_compressed = database.is_compressed()

                    # Check if compressed
                    if not db_file_compressed:
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )

                    # Check if indexed
                    if not os.path.exists(db_tbi_file):
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )

                    # Check database and header files
                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        log.error(f"Annotation annotation index: {db_tbi_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                        )
                    else:

                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Number of fields
                        nb_annotation_field = 0
                        annotation_list = []
                        annotation_infos_rename_list = []

                        for annotation_field in annotation_fields:

                            # Field new name, if provided (TODO: renaming not fully managed yet)
                            annotation_fields_new_name = annotation_fields.get(
                                annotation_field, annotation_field
                            )
                            if not annotation_fields_new_name:
                                annotation_fields_new_name = annotation_field

                            # Check if field is in DB and not already in input data
                            if (
                                annotation_field in db_hdr_vcf.get_header().infos
                                and annotation_fields_new_name
                                not in self.get_header().infos
                            ):

                                log.info(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                                )

                                # BCFTools annotate param to rename fields
                                if annotation_field != annotation_fields_new_name:
                                    annotation_infos_rename_list.append(
                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                    )

                                # Add INFO field to header
                                db_hdr_vcf_header_infos_number = (
                                    db_hdr_vcf_header_infos[annotation_field].num or "."
                                )
                                db_hdr_vcf_header_infos_type = (
                                    db_hdr_vcf_header_infos[annotation_field].type
                                    or "String"
                                )
                                db_hdr_vcf_header_infos_description = (
                                    db_hdr_vcf_header_infos[annotation_field].desc
                                    or f"{annotation_field} description"
                                )
                                db_hdr_vcf_header_infos_source = (
                                    db_hdr_vcf_header_infos[annotation_field].source
                                    or "unknown"
                                )
                                db_hdr_vcf_header_infos_version = (
                                    db_hdr_vcf_header_infos[annotation_field].version
                                    or "unknown"
                                )

                                vcf_reader.infos[annotation_fields_new_name] = (
                                    vcf.parser._Info(
                                        annotation_fields_new_name,
                                        db_hdr_vcf_header_infos_number,
                                        db_hdr_vcf_header_infos_type,
                                        db_hdr_vcf_header_infos_description,
                                        db_hdr_vcf_header_infos_source,
                                        db_hdr_vcf_header_infos_version,
                                        self.code_type_map[
                                            db_hdr_vcf_header_infos_type
                                        ],
                                    )
                                )

                                annotation_list.append(annotation_field)

                                nb_annotation_field += 1

                            else:

                                if (
                                    annotation_field
                                    not in db_hdr_vcf.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
                                    )
                                if (
                                    annotation_fields_new_name
                                    in self.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                    )

                        log.info(
                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                        )

                        annotation_infos = ",".join(annotation_list)

                        if annotation_infos != "":

                            # Annotated VCF (and error file)
                            tmp_annotation_vcf_name = os.path.join(
                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
                            )
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )

                            # Add fields to annotate
                            if not annotation_fields_full:
                                annotation_infos_option = f"-info {annotation_infos}"
                            else:
                                annotation_infos_option = ""

                            # Info fields rename
                            if annotation_infos_rename_list:
                                annotation_infos_rename = " -c " + ",".join(
                                    annotation_infos_rename_list
                                )
                            else:
                                annotation_infos_rename = ""

                            # Annotate command
                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands[command_annotate] = tmp_annotation_vcf_name

                if commands:

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Num command
                    nb_command = 0

                    # Annotate
                    for command_annotate in commands:
                        nb_command += 1
                        log.info(
                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                        )
                        log.debug(f"command_annotate={command_annotate}")
                        run_parallel_commands([command_annotate], threads)

                        # Update variants
                        log.info(
                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                        )
                        self.update_from_vcf(commands[command_annotate])
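
    # A minimal sketch (hypothetical file names) of the `param` layout consumed
    # by `annotation_snpsift`: each key under "annotations" is a bgzipped and
    # tabix-indexed database, and its value maps database INFO fields to
    # optional new names (None keeps the original name; "INFO" or "ALL"
    # selects every field declared in the database header):
    #
    #     param = {
    #         "annotation": {
    #             "snpsift": {
    #                 "annotations": {
    #                     "clinvar.vcf.gz": {"CLNSIG": None, "CLNDN": "ClinVar_disease"},
    #                     "dbnsfp.vcf.gz": {"INFO": None},
    #                 }
    #             }
    #         }
    #     }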

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        This function annotates with bcftools.

        :param threads: Number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info("VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name, if provided (TODO: renaming not fully managed yet)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # Annotation list (with rename syntax if needed)
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes
                        log.debug("Find chromosomes")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chromosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(chromosomes_list))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chromosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                    CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                    \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions"
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # If some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads - split available threads across annotation commands
                threads_bcftools_annotate = round(threads / len(commands))

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug("Annotation - Annotation commands: " + str(commands))
                log.info(
                    "Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        "Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages
                    log.info("Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # Log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # Debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # Failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info("Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
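
    # For reference, one generated per-chromosome command has this shape
    # (paths are illustrative placeholders, not real defaults):
    #
    #     bcftools annotate --pair-logic exact --regions-file=<tmp.bed> \
    #         -a <db.vcf.gz> -h <tmp.hdr> -c <fields> <input.vcf.gz> \
    #         -o <chunk.vcf.gz> -Oz1 2>><chunk.vcf.gz.err> \
    #         && tabix <chunk.vcf.gz>
    #
    # The per-chromosome chunks are then merged back onto the exported VCF with
    # `bcftools merge --force-samples` before `update_from_vcf` reloads them.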

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotates with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO).
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "outputOptions" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl").
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add Exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, the first sample in the VCF will be chosen
        - If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser (if not exists)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict,
                # either from param or by default (depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # Analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error("Analysis type unknown. Check param file.")
                        raise ValueError("Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict:
                # use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # Default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error("No analysis config")
                    raise ValueError("No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load phenopacket json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error("Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                "Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject does not exist -> find sample ID
                    if not param_exomiser_subject:

                        # Find sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error("No sample found")
                                raise ValueError("No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures does not exist -> try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Find HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures still empty -> remove hiPhivePrioritiser step
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # Default output formats
                    default_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output options in param -> default
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": default_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output directory and output formats (if exists in param)
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # Log
                log.debug(f"Post analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### Split analysis and sample config files ###

                # Split analysis dict
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample IDs within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample IDs in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and sample IDs in pedigree
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error("No samples found")
                    raise ValueError("No samples found")

                # Create VCF with samples (either samples in param or the first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # Phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # Data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # Variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # Transcript source (ucsc, refseq, ensembl)
                transcript_source = param_exomiser.get("transcript_source", None)
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually a unique sample)
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Exomiser TSV columns to INFO fields option
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to concat fields for update
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )
                                    ELSE ''
                                END
                                """
                            )

                    # Update query
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                    """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
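
    # A minimal sketch (hypothetical sample and HPO values) of the Exomiser
    # section of `param`, as described in the docstring above:
    #
    #     param = {
    #         "annotation": {
    #             "exomiser": {
    #                 "preset": "exome",
    #                 "sample": "sample1",
    #                 "hpo": ["0001156", "0001363"],
    #                 "exomiser_to_info": True,
    #             }
    #         }
    #     }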

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        This function annotates with snpEff.

        :param threads: The number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with snpEff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug("Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Options
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options (stats)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"snpEff java options: {snpeff_java_options}")

        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages
            log.info("Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # Log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # Debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # Failed
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            snpeff_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in snpeff_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = snpeff_vcf_header.infos.get(ann)

            # Update variants
            log.info("Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug("Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug("Existing snpEff annotations in VCF - annotation forced")
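
    # A minimal sketch of the snpEff section of `param`; "OUTPUT" in "stats"
    # and "csvStats" is replaced by the output file path at runtime, and the
    # option flags shown are illustrative:
    #
    #     param = {
    #         "annotation": {
    #             "snpeff": {
    #                 "options": "-lof -hgvs",
    #                 "stats": "OUTPUT.html",
    #                 "csvStats": "OUTPUT.csv",
    #             }
    #         }
    #     }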

    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations.

        :param threads: number of threads to use
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = "Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info("VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:
            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # Field new name, if provided (TODO: renaming not fully managed yet)
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:
                        # Field already in header and annotation not forced
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # Protocol
                protocol = annotation

                # Argument
                argument = ""

                # Operation
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # Argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # Command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5741 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5742 5743 # Command - Special characters (refGene annotation) 5744 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5745 5746 # Command - Clean empty fields (with value ".") 5747 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5748 5749 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5750 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5751 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5752 # for ann in annotation_renamed_list: 5753 for ann in annotation_list: 5754 annovar_fields_to_keep.append(f"^INFO/{ann}") 5755 5756 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5757 5758 # Command - indexing 5759 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5760 5761 log.debug(f"Annotation - Annovar command: {command_annovar}") 5762 run_parallel_commands([command_annovar], 1) 5763 5764 # Error messages 5765 log.info(f"Error/Warning messages:") 5766 error_message_command_all = [] 5767 error_message_command_warning = [] 5768 error_message_command_err = [] 5769 for err_file in err_files: 5770 with open(err_file, "r") as f: 5771 for line in f: 5772 message = line.strip() 5773 error_message_command_all.append(message) 5774 if line.startswith("[W::") or line.startswith("WARNING"): 5775 error_message_command_warning.append(message) 5776 if line.startswith("[E::") or line.startswith("ERROR"): 5777 error_message_command_err.append( 5778 f"{err_file}: " + message 5779 ) 5780 # log info 5781 for message in list( 5782 set(error_message_command_err + error_message_command_warning) 5783 ): 5784 log.info(f" {message}") 5785 # debug info 5786 for message in list(set(error_message_command_all)): 5787 log.debug(f" {message}") 5788 # failed 5789 if len(error_message_command_err): 5790 log.error("Annotation failed: Error in commands") 5791 raise ValueError("Annotation failed: Error in commands") 5792 5793 if tmp_annotates_vcf_name_list: 5794 5795 # List of annotated files 5796 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5797 5798 # Tmp file 5799 tmp_annotate_vcf = NamedTemporaryFile( 5800 prefix=self.get_prefix(), 5801 dir=self.get_tmp_dir(), 5802 suffix=".vcf.gz", 5803 delete=False, 5804 ) 5805 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5806 tmp_files.append(tmp_annotate_vcf_name) 5807 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5808 err_files.append(tmp_annotate_vcf_name_err) 5809 tmp_files.append(tmp_annotate_vcf_name_err) 5810 5811 # Command merge 5812 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5813 log.info( 5814 f"Annotation Annovar - Annotation merging " 5815 + str(len(tmp_annotates_vcf_name_list)) 5816 + " annotated files" 5817 ) 5818 log.debug(f"Annotation - merge command: {merge_command}") 5819 run_parallel_commands([merge_command], 
1) 5820 5821 # Find annotation in header 5822 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5823 header_list = self.read_vcf_header(f) 5824 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5825 5826 for ann in annovar_vcf_header.infos: 5827 if ann not in self.get_header().infos: 5828 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5829 5830 # Update variants 5831 log.info(f"Annotation Annovar - Updating...") 5832 self.update_from_vcf(tmp_annotate_vcf_name) 5833 5834 # Clean files 5835 # Tmp file remove command 5836 if True: 5837 tmp_files_remove_command = "" 5838 if tmp_files: 5839 tmp_files_remove_command = " ".join(tmp_files) 5840 clean_command = f" rm -f {tmp_files_remove_command} " 5841 log.debug(f"Annotation Annovar - Annotation cleaning ") 5842 log.debug(f"Annotation - cleaning command: {clean_command}") 5843 run_parallel_commands([clean_command], 1)
Takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: number of threads to use
Returns
None; the variants table is updated in place.
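A minimal usage sketch (file names, database paths and the assembly are hypothetical; the `param` and `config` keys mirror the lookups in the method above):

# Hypothetical paths and database names; adapt to your installation.
variants = Variants(
    input="sample.vcf.gz",
    output="sample.annotated.vcf.gz",
    config={
        "assembly": "hg19",
        "folders": {"databases": {"annovar": ["/databases/annovar"]}},
    },
    param={
        "annotation": {
            "annovar": {
                # database -> fields; an empty value means "keep all INFO fields"
                "annotations": {"refGene": None},
                "options": {},
            }
        }
    },
    load=True,
)
variants.annotation_annovar(threads=4)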
5846 def annotation_parquet(self, threads: int = None) -> None: 5847 """ 5848 It takes a VCF file, and annotates it with a parquet file 5849 5850 :param threads: number of threads to use for the annotation 5851 :return: the value of the variable "result". 5852 """ 5853 5854 # DEBUG 5855 log.debug("Start annotation with parquet databases") 5856 5857 # Threads 5858 if not threads: 5859 threads = self.get_threads() 5860 log.debug("Threads: " + str(threads)) 5861 5862 # DEBUG 5863 delete_tmp = True 5864 if self.get_config().get("verbosity", "warning") in ["debug"]: 5865 delete_tmp = False 5866 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5867 5868 # Config 5869 databases_folders = set( 5870 self.get_config() 5871 .get("folders", {}) 5872 .get("databases", {}) 5873 .get("annotations", ["."]) 5874 + self.get_config() 5875 .get("folders", {}) 5876 .get("databases", {}) 5877 .get("parquet", ["."]) 5878 ) 5879 log.debug("Databases annotations: " + str(databases_folders)) 5880 5881 # Param 5882 annotations = ( 5883 self.get_param() 5884 .get("annotation", {}) 5885 .get("parquet", {}) 5886 .get("annotations", None) 5887 ) 5888 log.debug("Annotations: " + str(annotations)) 5889 5890 # Assembly 5891 assembly = self.get_param().get( 5892 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5893 ) 5894 5895 # Force Update Annotation 5896 force_update_annotation = ( 5897 self.get_param() 5898 .get("annotation", {}) 5899 .get("options", {}) 5900 .get("annotations_update", False) 5901 ) 5902 log.debug(f"force_update_annotation={force_update_annotation}") 5903 force_append_annotation = ( 5904 self.get_param() 5905 .get("annotation", {}) 5906 .get("options", {}) 5907 .get("annotations_append", False) 5908 ) 5909 log.debug(f"force_append_annotation={force_append_annotation}") 5910 5911 # Data 5912 table_variants = self.get_table_variants() 5913 5914 # Check if not empty 5915 log.debug("Check if not empty") 5916 sql_query_chromosomes_df = self.get_query_to_df( 5917 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5918 ) 5919 if not sql_query_chromosomes_df["count"][0]: 5920 log.info(f"VCF empty") 5921 return 5922 5923 # VCF header 5924 vcf_reader = self.get_header() 5925 log.debug("Initial header: " + str(vcf_reader.infos)) 5926 5927 # Nb Variants POS 5928 log.debug("NB Variants Start") 5929 nb_variants = self.conn.execute( 5930 f"SELECT count(*) AS count FROM variants" 5931 ).fetchdf()["count"][0] 5932 log.debug("NB Variants Stop") 5933 5934 # Existing annotations 5935 for vcf_annotation in self.get_header().infos: 5936 5937 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5938 log.debug( 5939 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5940 ) 5941 5942 # Added columns 5943 added_columns = [] 5944 5945 # drop indexes 5946 log.debug(f"Drop indexes...") 5947 self.drop_indexes() 5948 5949 if annotations: 5950 5951 if "ALL" in annotations: 5952 5953 all_param = annotations.get("ALL", {}) 5954 all_param_formats = all_param.get("formats", None) 5955 all_param_releases = all_param.get("releases", None) 5956 5957 databases_infos_dict = self.scan_databases( 5958 database_formats=all_param_formats, 5959 database_releases=all_param_releases, 5960 ) 5961 for database_infos in databases_infos_dict.keys(): 5962 if database_infos not in annotations: 5963 annotations[database_infos] = {"INFO": None} 5964 5965 for annotation in annotations: 5966 5967 if annotation in ["ALL"]: 5968 continue 5969 5970 # Annotation Name 5971 
annotation_name = os.path.basename(annotation) 5972 5973 # Annotation fields 5974 annotation_fields = annotations[annotation] 5975 if not annotation_fields: 5976 annotation_fields = {"INFO": None} 5977 5978 log.debug(f"Annotation '{annotation_name}'") 5979 log.debug( 5980 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5981 ) 5982 5983 # Create Database 5984 database = Database( 5985 database=annotation, 5986 databases_folders=databases_folders, 5987 assembly=assembly, 5988 ) 5989 5990 # Find files 5991 parquet_file = database.get_database() 5992 parquet_hdr_file = database.get_header_file() 5993 parquet_type = database.get_type() 5994 5995 # Check if files exists 5996 if not parquet_file or not parquet_hdr_file: 5997 msg_err_list = [] 5998 if not parquet_file: 5999 msg_err_list.append( 6000 f"Annotation failed: Annotation file not found" 6001 ) 6002 if parquet_file and not parquet_hdr_file: 6003 msg_err_list.append( 6004 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6005 ) 6006 6007 log.error(". ".join(msg_err_list)) 6008 raise ValueError(". ".join(msg_err_list)) 6009 else: 6010 # Get parquet connexion 6011 parquet_sql_attach = database.get_sql_database_attach( 6012 output="query" 6013 ) 6014 if parquet_sql_attach: 6015 self.conn.execute(parquet_sql_attach) 6016 parquet_file_link = database.get_sql_database_link() 6017 # Log 6018 log.debug( 6019 f"Annotation '{annotation_name}' - file: " 6020 + str(parquet_file) 6021 + " and " 6022 + str(parquet_hdr_file) 6023 ) 6024 6025 # Database full header columns 6026 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6027 parquet_hdr_file 6028 ) 6029 # Log 6030 log.debug( 6031 "Annotation database header columns : " 6032 + str(parquet_hdr_vcf_header_columns) 6033 ) 6034 6035 # Load header as VCF object 6036 parquet_hdr_vcf_header_infos = database.get_header().infos 6037 # Log 6038 log.debug( 6039 "Annotation database header: " 6040 + str(parquet_hdr_vcf_header_infos) 6041 ) 6042 6043 # Get extra infos 6044 parquet_columns = database.get_extra_columns() 6045 # Log 6046 log.debug("Annotation database Columns: " + str(parquet_columns)) 6047 6048 # Add extra columns if "ALL" in annotation_fields 6049 # if "ALL" in annotation_fields: 6050 # allow_add_extra_column = True 6051 if "ALL" in annotation_fields and database.get_extra_columns(): 6052 for extra_column in database.get_extra_columns(): 6053 if ( 6054 extra_column not in annotation_fields 6055 and extra_column.replace("INFO/", "") 6056 not in parquet_hdr_vcf_header_infos 6057 ): 6058 parquet_hdr_vcf_header_infos[extra_column] = ( 6059 vcf.parser._Info( 6060 extra_column, 6061 ".", 6062 "String", 6063 f"{extra_column} description", 6064 "unknown", 6065 "unknown", 6066 self.code_type_map["String"], 6067 ) 6068 ) 6069 6070 # For all fields in database 6071 annotation_fields_all = False 6072 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6073 annotation_fields_all = True 6074 annotation_fields = { 6075 key: key for key in parquet_hdr_vcf_header_infos 6076 } 6077 6078 log.debug( 6079 "Annotation database header - All annotations added: " 6080 + str(annotation_fields) 6081 ) 6082 6083 # Init 6084 6085 # List of annotation fields to use 6086 sql_query_annotation_update_info_sets = [] 6087 6088 # List of annotation to agregate 6089 sql_query_annotation_to_agregate = [] 6090 6091 # Number of fields 6092 nb_annotation_field = 0 6093 6094 # Annotation fields processed 6095 annotation_fields_processed 
= [] 6096 6097 # Columns mapping 6098 map_columns = database.map_columns( 6099 columns=annotation_fields, prefixes=["INFO/"] 6100 ) 6101 6102 # Query dict for fields to remove (update option) 6103 query_dict_remove = {} 6104 6105 # Fetch Anotation fields 6106 for annotation_field in annotation_fields: 6107 6108 # annotation_field_column 6109 annotation_field_column = map_columns.get( 6110 annotation_field, "INFO" 6111 ) 6112 6113 # field new name, if parametered 6114 annotation_fields_new_name = annotation_fields.get( 6115 annotation_field, annotation_field 6116 ) 6117 if not annotation_fields_new_name: 6118 annotation_fields_new_name = annotation_field 6119 6120 # To annotate 6121 # force_update_annotation = True 6122 # force_append_annotation = True 6123 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6124 if annotation_field in parquet_hdr_vcf_header_infos and ( 6125 force_update_annotation 6126 or force_append_annotation 6127 or ( 6128 annotation_fields_new_name 6129 not in self.get_header().infos 6130 ) 6131 ): 6132 6133 # Add field to annotation to process list 6134 annotation_fields_processed.append( 6135 annotation_fields_new_name 6136 ) 6137 6138 # explode infos for the field 6139 annotation_fields_new_name_info_msg = "" 6140 if ( 6141 force_update_annotation 6142 and annotation_fields_new_name 6143 in self.get_header().infos 6144 ): 6145 # Remove field from INFO 6146 query = f""" 6147 UPDATE {table_variants} as table_variants 6148 SET INFO = REGEXP_REPLACE( 6149 concat(table_variants.INFO,''), 6150 ';*{annotation_fields_new_name}=[^;]*', 6151 '' 6152 ) 6153 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6154 """ 6155 annotation_fields_new_name_info_msg = " [update]" 6156 query_dict_remove[ 6157 f"remove 'INFO/{annotation_fields_new_name}'" 6158 ] = query 6159 6160 # Sep between fields in INFO 6161 nb_annotation_field += 1 6162 if nb_annotation_field > 1: 6163 annotation_field_sep = ";" 6164 else: 6165 annotation_field_sep = "" 6166 6167 log.info( 6168 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6169 ) 6170 6171 # Add INFO field to header 6172 parquet_hdr_vcf_header_infos_number = ( 6173 parquet_hdr_vcf_header_infos[annotation_field].num 6174 or "." 
6175 ) 6176 parquet_hdr_vcf_header_infos_type = ( 6177 parquet_hdr_vcf_header_infos[annotation_field].type 6178 or "String" 6179 ) 6180 parquet_hdr_vcf_header_infos_description = ( 6181 parquet_hdr_vcf_header_infos[annotation_field].desc 6182 or f"{annotation_field} description" 6183 ) 6184 parquet_hdr_vcf_header_infos_source = ( 6185 parquet_hdr_vcf_header_infos[annotation_field].source 6186 or "unknown" 6187 ) 6188 parquet_hdr_vcf_header_infos_version = ( 6189 parquet_hdr_vcf_header_infos[annotation_field].version 6190 or "unknown" 6191 ) 6192 6193 vcf_reader.infos[annotation_fields_new_name] = ( 6194 vcf.parser._Info( 6195 annotation_fields_new_name, 6196 parquet_hdr_vcf_header_infos_number, 6197 parquet_hdr_vcf_header_infos_type, 6198 parquet_hdr_vcf_header_infos_description, 6199 parquet_hdr_vcf_header_infos_source, 6200 parquet_hdr_vcf_header_infos_version, 6201 self.code_type_map[ 6202 parquet_hdr_vcf_header_infos_type 6203 ], 6204 ) 6205 ) 6206 6207 # Append 6208 if force_append_annotation: 6209 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 6210 else: 6211 query_case_when_append = "" 6212 6213 # Annotation/Update query fields 6214 # Found in INFO column 6215 if ( 6216 annotation_field_column == "INFO" 6217 and "INFO" in parquet_hdr_vcf_header_columns 6218 ): 6219 sql_query_annotation_update_info_sets.append( 6220 f""" 6221 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6222 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6223 ELSE '' 6224 END 6225 """ 6226 ) 6227 # Found in a specific column 6228 else: 6229 sql_query_annotation_update_info_sets.append( 6230 f""" 6231 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6232 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6233 ELSE '' 6234 END 6235 """ 6236 ) 6237 sql_query_annotation_to_agregate.append( 6238 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6239 ) 6240 6241 # Not to annotate 6242 else: 6243 6244 if force_update_annotation: 6245 annotation_message = "forced" 6246 else: 6247 annotation_message = "skipped" 6248 6249 if annotation_field not in parquet_hdr_vcf_header_infos: 6250 log.warning( 6251 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6252 ) 6253 if annotation_fields_new_name in self.get_header().infos: 6254 log.warning( 6255 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6256 ) 6257 6258 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6259 # allow_annotation_full_info = True 6260 allow_annotation_full_info = not force_append_annotation 6261 6262 if parquet_type in ["regions"]: 6263 allow_annotation_full_info = False 6264 6265 if ( 6266 allow_annotation_full_info 6267 and nb_annotation_field == len(annotation_fields) 6268 and annotation_fields_all 6269 and ( 6270 "INFO" in parquet_hdr_vcf_header_columns 6271 and "INFO" in database.get_extra_columns() 6272 ) 6273 ): 6274 log.debug("Column INFO annotation enabled") 6275 sql_query_annotation_update_info_sets = [] 6276 sql_query_annotation_update_info_sets.append( 6277 f" table_parquet.INFO " 6278 ) 6279 6280 if sql_query_annotation_update_info_sets: 6281 6282 # Annotate 6283 log.info(f"Annotation '{annotation_name}' - Annotation...") 6284 6285 # Join query annotation update info sets for SQL 6286 sql_query_annotation_update_info_sets_sql = ",".join( 6287 sql_query_annotation_update_info_sets 6288 ) 6289 6290 # Check chromosomes list (and variants infos) 6291 sql_query_chromosomes = f""" 6292 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6293 FROM {table_variants} as table_variants 6294 GROUP BY table_variants."#CHROM" 6295 ORDER BY table_variants."#CHROM" 6296 """ 6297 sql_query_chromosomes_df = self.conn.execute( 6298 sql_query_chromosomes 6299 ).df() 6300 sql_query_chromosomes_dict = { 6301 entry["CHROM"]: { 6302 "count": entry["count_variants"], 6303 "min": entry["min_variants"], 6304 "max": entry["max_variants"], 6305 } 6306 for index, entry in sql_query_chromosomes_df.iterrows() 6307 } 6308 6309 # Init 6310 nb_of_query = 0 6311 nb_of_variant_annotated = 0 6312 query_dict = query_dict_remove 6313 6314 # for chrom in sql_query_chromosomes_df["CHROM"]: 6315 for chrom in sql_query_chromosomes_dict: 6316 6317 # Number of variant by chromosome 6318 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6319 chrom, {} 6320 ).get("count", 0) 6321 6322 log.debug( 6323 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
6324 ) 6325 6326 # Annotation with regions database 6327 if parquet_type in ["regions"]: 6328 sql_query_annotation_from_clause = f""" 6329 FROM ( 6330 SELECT 6331 '{chrom}' AS \"#CHROM\", 6332 table_variants_from.\"POS\" AS \"POS\", 6333 {",".join(sql_query_annotation_to_agregate)} 6334 FROM {table_variants} as table_variants_from 6335 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6336 table_parquet_from."#CHROM" = '{chrom}' 6337 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6338 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6339 ) 6340 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6341 GROUP BY table_variants_from.\"POS\" 6342 ) 6343 as table_parquet 6344 """ 6345 6346 sql_query_annotation_where_clause = """ 6347 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6348 AND table_parquet.\"POS\" = table_variants.\"POS\" 6349 """ 6350 6351 # Annotation with variants database 6352 else: 6353 sql_query_annotation_from_clause = f""" 6354 FROM {parquet_file_link} as table_parquet 6355 """ 6356 sql_query_annotation_where_clause = f""" 6357 table_variants."#CHROM" = '{chrom}' 6358 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6359 AND table_parquet.\"POS\" = table_variants.\"POS\" 6360 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6361 AND table_parquet.\"REF\" = table_variants.\"REF\" 6362 """ 6363 6364 # Create update query 6365 sql_query_annotation_chrom_interval_pos = f""" 6366 UPDATE {table_variants} as table_variants 6367 SET INFO = 6368 concat( 6369 CASE WHEN table_variants.INFO NOT IN ('','.') 6370 THEN table_variants.INFO 6371 ELSE '' 6372 END 6373 , 6374 CASE WHEN table_variants.INFO NOT IN ('','.') 6375 AND ( 6376 concat({sql_query_annotation_update_info_sets_sql}) 6377 ) 6378 NOT IN ('','.') 6379 THEN ';' 6380 ELSE '' 6381 END 6382 , 6383 {sql_query_annotation_update_info_sets_sql} 6384 ) 6385 {sql_query_annotation_from_clause} 6386 WHERE {sql_query_annotation_where_clause} 6387 ; 6388 """ 6389 6390 # Add update query to dict 6391 query_dict[ 6392 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6393 ] = sql_query_annotation_chrom_interval_pos 6394 6395 nb_of_query = len(query_dict) 6396 num_query = 0 6397 6398 # SET max_expression_depth TO x 6399 self.conn.execute("SET max_expression_depth TO 10000") 6400 6401 for query_name in query_dict: 6402 query = query_dict[query_name] 6403 num_query += 1 6404 log.info( 6405 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6406 ) 6407 result = self.conn.execute(query) 6408 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6409 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6410 log.info( 6411 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6412 ) 6413 6414 log.info( 6415 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6416 ) 6417 6418 else: 6419 6420 log.info( 6421 f"Annotation '{annotation_name}' - No Annotations available" 6422 ) 6423 6424 log.debug("Final header: " + str(vcf_reader.infos)) 6425 6426 # Remove added columns 6427 for added_column in added_columns: 6428 self.drop_column(column=added_column)
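The UPDATE queries above all rely on the same DuckDB idiom to read one tag out of a raw INFO string: prepend ';' so the first field matches too, then capture the value with regexp_extract. A self-contained toy illustration (not HOWARD code):

import duckdb

con = duckdb.connect()
row = con.execute("""
    SELECT regexp_extract(
        concat(';', 'AF=0.01;DP=35;AC=2'),  -- a toy INFO string
        ';DP=([^;]*)', 1                    -- capture the value of the DP tag
    ) AS dp
""").fetchone()
print(row)  # ('35',)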
Takes a VCF file and annotates it with a Parquet annotation database.
Parameters
- threads: number of threads to use for the annotation
Returns
None; the variants table is updated in place.
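A minimal usage sketch with hypothetical Parquet files (each file needs its '.hdr' header companion, as checked above). Field mappings follow the {source_field: new_name} convention of the method, and "INFO": None requests every field of the database:

variants = Variants(
    input="sample.vcf.gz",
    config={
        "assembly": "hg19",
        "folders": {"databases": {"parquet": ["/databases/parquet"]}},
    },
    param={
        "annotation": {
            "parquet": {
                "annotations": {
                    "clinvar.parquet": {"CLNSIG": "clinvar_CLNSIG"},  # rename on the fly
                    "gnomad.parquet": {"INFO": None},                 # take all fields
                }
            },
            "options": {"annotations_update": False, "annotations_append": False},
        }
    },
    load=True,
)
variants.annotation_parquet(threads=2)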
6430 def annotation_splice(self, threads: int = None) -> None: 6431 """ 6432 This function annotate with snpEff 6433 6434 :param threads: The number of threads to use 6435 :return: the value of the variable "return_value". 6436 """ 6437 6438 # DEBUG 6439 log.debug("Start annotation with splice tools") 6440 6441 # Threads 6442 if not threads: 6443 threads = self.get_threads() 6444 log.debug("Threads: " + str(threads)) 6445 6446 # DEBUG 6447 delete_tmp = True 6448 if self.get_config().get("verbosity", "warning") in ["debug"]: 6449 delete_tmp = False 6450 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6451 6452 # Config 6453 config = self.get_config() 6454 log.debug("Config: " + str(config)) 6455 splice_config = config.get("tools", {}).get("splice", {}) 6456 if not splice_config: 6457 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6458 msg_err = "No Splice tool config" 6459 raise ValueError(msg_err) 6460 log.debug(f"splice_config: {splice_config}") 6461 6462 # Config - Folders - Databases 6463 databases_folders = ( 6464 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6465 ) 6466 log.debug("Databases annotations: " + str(databases_folders)) 6467 6468 # Splice docker image 6469 splice_docker_image = splice_config.get("docker").get("image") 6470 6471 # Pull splice image if it's not already there 6472 if not check_docker_image_exists(splice_docker_image): 6473 log.warning( 6474 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6475 ) 6476 try: 6477 command(f"docker pull {splice_config.get('docker').get('image')}") 6478 except subprocess.CalledProcessError: 6479 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6480 log.error(msg_err) 6481 raise ValueError(msg_err) 6482 6483 # Config - splice databases 6484 splice_databases = ( 6485 config.get("folders", {}) 6486 .get("databases", {}) 6487 .get("splice", DEFAULT_SPLICE_FOLDER) 6488 ) 6489 splice_databases = full_path(splice_databases) 6490 6491 # Param 6492 param = self.get_param() 6493 log.debug("Param: " + str(param)) 6494 6495 # Param 6496 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6497 log.debug("Options: " + str(options)) 6498 6499 # Data 6500 table_variants = self.get_table_variants() 6501 6502 # Check if not empty 6503 log.debug("Check if not empty") 6504 sql_query_chromosomes = ( 6505 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6506 ) 6507 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6508 log.info("VCF empty") 6509 return None 6510 6511 # Export in VCF 6512 log.debug("Create initial file to annotate") 6513 6514 # Create output folder / work folder 6515 if options.get("output_folder", ""): 6516 output_folder = options.get("output_folder", "") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 else: 6520 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6521 if not os.path.exists(output_folder): 6522 Path(output_folder).mkdir(parents=True, exist_ok=True) 6523 6524 if options.get("workdir", ""): 6525 workdir = options.get("workdir", "") 6526 else: 6527 workdir = "/work" 6528 6529 # Create tmp VCF file 6530 tmp_vcf = NamedTemporaryFile( 6531 prefix=self.get_prefix(), 6532 dir=output_folder, 6533 suffix=".vcf", 6534 delete=False, 6535 ) 6536 tmp_vcf_name = tmp_vcf.name 6537 6538 # VCF header 6539 header = self.get_header() 6540 6541 # Existing annotations 6542 for 
vcf_annotation in self.get_header().infos: 6543 6544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6545 log.debug( 6546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6547 ) 6548 6549 # Memory limit 6550 if config.get("memory", None): 6551 memory_limit = config.get("memory", "8G").upper() 6552 # upper() 6553 else: 6554 memory_limit = "8G" 6555 log.debug(f"memory_limit: {memory_limit}") 6556 6557 # Check number of variants to annotate 6558 where_clause_regex_spliceai = r"SpliceAI_\w+" 6559 where_clause_regex_spip = r"SPiP_\w+" 6560 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6561 df_list_of_variants_to_annotate = self.get_query_to_df( 6562 query=f""" SELECT * FROM variants {where_clause} """ 6563 ) 6564 if len(df_list_of_variants_to_annotate) == 0: 6565 log.warning( 6566 f"No variants to annotate with splice. Variants probably already annotated with splice" 6567 ) 6568 return None 6569 else: 6570 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6571 6572 # Export VCF file 6573 self.export_variant_vcf( 6574 vcf_file=tmp_vcf_name, 6575 remove_info=True, 6576 add_samples=True, 6577 index=False, 6578 where_clause=where_clause, 6579 ) 6580 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6581 if any(value for value in splice_config.values() if value is None): 6582 log.warning("At least one splice config parameter is empty") 6583 # exit annotation_splice 6584 return None 6585 6586 # Params in splice nf 6587 def check_values(dico: dict): 6588 """ 6589 Ensure parameters for NF splice pipeline 6590 """ 6591 for key, val in dico.items(): 6592 if key == "genome": 6593 if any( 6594 assemb in options.get("genome", {}) 6595 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6596 ): 6597 yield f"--{key} hg19" 6598 elif any( 6599 assemb in options.get("genome", {}) 6600 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6601 ): 6602 yield f"--{key} hg38" 6603 elif ( 6604 (isinstance(val, str) and val) 6605 or isinstance(val, int) 6606 or isinstance(val, bool) 6607 ): 6608 yield f"--{key} {val}" 6609 6610 # Genome 6611 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6612 options["genome"] = genome 6613 # NF params 6614 nf_params = [] 6615 # Add options 6616 if options: 6617 log.debug(options) 6618 nf_params = list(check_values(options)) 6619 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6620 else: 6621 log.debug("No NF params provided") 6622 # Add threads 6623 if "threads" not in options.keys(): 6624 nf_params.append(f"--threads {threads}") 6625 # Genome path 6626 genome_path = find_genome( 6627 config.get("folders", {}) 6628 .get("databases", {}) 6629 .get("genomes", DEFAULT_GENOME_FOLDER), 6630 file=f"{genome}.fa", 6631 ) 6632 # Add genome path 6633 if not genome_path: 6634 raise ValueError( 6635 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6636 ) 6637 else: 6638 log.debug(f"Genome: {genome_path}") 6639 nf_params.append(f"--genome_path {genome_path}") 6640 6641 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6642 """ 6643 Setting up updated databases for SPiP and SpliceAI 6644 """ 6645 6646 try: 6647 6648 # SpliceAI assembly transcriptome 6649 spliceai_assembly = os.path.join( 6650 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6651 options.get("genome"), 
6652 "transcriptome", 6653 ) 6654 spip_assembly = options.get("genome") 6655 6656 spip = find( 6657 f"transcriptome_{spip_assembly}.RData", 6658 config.get("folders", {}).get("databases", {}).get("spip", {}), 6659 ) 6660 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6661 log.debug(f"SPiP annotations: {spip}") 6662 log.debug(f"SpliceAI annotations: {spliceai}") 6663 if spip and spliceai: 6664 return [ 6665 f"--spip_transcriptome {spip}", 6666 f"--spliceai_transcriptome {spliceai}", 6667 ] 6668 else: 6669 log.warning( 6670 "Can't find splice databases in configuration, use annotations file from image" 6671 ) 6672 except TypeError: 6673 log.warning( 6674 "Can't find splice databases in configuration, use annotations file from image" 6675 ) 6676 return [] 6677 6678 # Add options, check if transcriptome option have already beend provided 6679 if ( 6680 "spip_transcriptome" not in nf_params 6681 and "spliceai_transcriptome" not in nf_params 6682 ): 6683 splice_reference = splice_annotations(options, config) 6684 if splice_reference: 6685 nf_params.extend(splice_reference) 6686 # nf_params.append(f"--output_folder {output_folder}") 6687 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6688 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6689 log.debug(cmd) 6690 splice_config["docker"]["command"] = cmd 6691 6692 # Ensure proxy is set 6693 proxy = [ 6694 f"-e {var}={os.getenv(var)}" 6695 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6696 if os.getenv(var) is not None 6697 ] 6698 docker_cmd = get_bin_command( 6699 tool="splice", 6700 bin_type="docker", 6701 config=config, 6702 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6703 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6704 ) 6705 # print(docker_cmd) 6706 # exit() 6707 # Docker debug 6708 # if splice_config.get("rm_container"): 6709 # rm_container = "--rm" 6710 # else: 6711 # rm_container = "" 6712 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6713 log.debug(docker_cmd) 6714 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6715 log.debug(res.stdout) 6716 if res.stderr: 6717 log.error(res.stderr) 6718 res.check_returncode() 6719 # Update variants 6720 log.info("Annotation - Updating...") 6721 # Test find output vcf 6722 log.debug( 6723 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6724 ) 6725 output_vcf = [] 6726 # Wrong folder to look in 6727 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6728 if ( 6729 files 6730 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6731 ): 6732 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6733 # log.debug(os.listdir(options.get("output_folder"))) 6734 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6735 if not output_vcf: 6736 log.debug( 6737 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6738 ) 6739 else: 6740 # Get new header from annotated vcf 6741 log.debug(f"Initial header: {len(header.infos)} fields") 6742 # Create new header with splice infos 6743 new_vcf = 
Variants(input=output_vcf[0]) 6744 new_vcf_header = new_vcf.get_header().infos 6745 for keys, infos in new_vcf_header.items(): 6746 if keys not in header.infos.keys(): 6747 header.infos[keys] = infos 6748 log.debug(f"New header: {len(header.infos)} fields") 6749 log.debug(f"Splice tmp output: {output_vcf[0]}") 6750 self.update_from_vcf(output_vcf[0]) 6751 6752 # Remove file 6753 remove_if_exists(output_vcf)
This function annotates variants with splice prediction tools (SPiP and SpliceAI) run through a Docker image.
Parameters
- threads: the number of threads to use
Returns
None; the variants table is updated in place.
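A usage sketch under stated assumptions: the Docker image tag and all folders are hypothetical, and the config layout only mirrors the lookups in the method above (tools.splice.docker.image, folders.databases.splice/genomes, and the genome option):

variants = Variants(
    input="sample.vcf.gz",
    config={
        "assembly": "hg19",
        "tools": {"splice": {"docker": {"image": "splicetoolbox:latest"}}},  # hypothetical tag
        "folders": {
            "databases": {
                "splice": ["/databases/splice"],
                "genomes": "/databases/genomes",  # must contain hg19.fa
            }
        },
    },
    param={"annotation": {"splice": {"options": {"genome": "hg19"}}}},
    load=True,
)
variants.annotation_splice(threads=4)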
6759 def get_config_default(self, name: str) -> dict: 6760 """ 6761 The function `get_config_default` returns a dictionary containing default configurations for 6762 various calculations and prioritizations. 6763 6764 :param name: The `get_config_default` function returns a dictionary containing default 6765 configurations for different calculations and prioritizations. The `name` parameter is used to 6766 specify which specific configuration to retrieve from the dictionary 6767 :type name: str 6768 :return: The function `get_config_default` returns a dictionary containing default configuration 6769 settings for different calculations and prioritizations. The specific configuration settings are 6770 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6771 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6772 returned. If there is no match, an empty dictionary is returned. 6773 """ 6774 6775 config_default = { 6776 "calculations": { 6777 "variant_chr_pos_alt_ref": { 6778 "type": "sql", 6779 "name": "variant_chr_pos_alt_ref", 6780 "description": "Create a variant ID with chromosome, position, alt and ref", 6781 "available": False, 6782 "output_column_name": "variant_chr_pos_alt_ref", 6783 "output_column_type": "String", 6784 "output_column_description": "variant ID with chromosome, position, alt and ref", 6785 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6786 "operation_info": True, 6787 }, 6788 "VARTYPE": { 6789 "type": "sql", 6790 "name": "VARTYPE", 6791 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6792 "available": True, 6793 "table": "variants", 6794 "output_column_name": "VARTYPE", 6795 "output_column_type": "String", 6796 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6797 "operation_query": """ 6798 CASE 6799 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6800 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6801 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6802 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6803 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6804 ELSE 'UNDEFINED' 6805 END 6806 """, 6807 "info_fields": ["SVTYPE"], 6808 "operation_info": True, 6809 }, 6810 "snpeff_hgvs": { 6811 "type": "python", 6812 "name": "snpeff_hgvs", 6813 "description": "HGVS nomenclatures from snpEff annotation", 6814 "available": True, 6815 "function_name": "calculation_extract_snpeff_hgvs", 6816 "function_params": ["snpeff_hgvs", "ANN"], 6817 }, 6818 "snpeff_ann_explode": { 6819 "type": "python", 6820 "name": "snpeff_ann_explode", 6821 "description": "Explode snpEff annotations with uniquify values", 6822 "available": True, 6823 "function_name": "calculation_snpeff_ann_explode", 6824 "function_params": [False, "fields", "snpeff_", "ANN"], 6825 }, 6826 "snpeff_ann_explode_uniquify": { 6827 "type": "python", 6828 "name": "snpeff_ann_explode_uniquify", 6829 "description": "Explode snpEff annotations", 6830 "available": True, 6831 "function_name": "calculation_snpeff_ann_explode", 6832 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6833 }, 6834 "snpeff_ann_explode_json": { 6835 "type": "python", 6836 "name": "snpeff_ann_explode_json", 6837 "description": "Explode snpEff annotations in JSON format", 6838 "available": True, 6839 "function_name": "calculation_snpeff_ann_explode", 6840 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6841 }, 6842 "NOMEN": { 6843 
"type": "python", 6844 "name": "NOMEN", 6845 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6846 "available": True, 6847 "function_name": "calculation_extract_nomen", 6848 "function_params": [], 6849 }, 6850 "RENAME_INFO_FIELDS": { 6851 "type": "python", 6852 "name": "RENAME_INFO_FIELDS", 6853 "description": "Rename or remove INFO/tags", 6854 "available": True, 6855 "function_name": "calculation_rename_info_fields", 6856 "function_params": [], 6857 }, 6858 "FINDBYPIPELINE": { 6859 "type": "python", 6860 "name": "FINDBYPIPELINE", 6861 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6862 "available": True, 6863 "function_name": "calculation_find_by_pipeline", 6864 "function_params": ["findbypipeline"], 6865 }, 6866 "FINDBYSAMPLE": { 6867 "type": "python", 6868 "name": "FINDBYSAMPLE", 6869 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6870 "available": True, 6871 "function_name": "calculation_find_by_pipeline", 6872 "function_params": ["findbysample"], 6873 }, 6874 "GENOTYPECONCORDANCE": { 6875 "type": "python", 6876 "name": "GENOTYPECONCORDANCE", 6877 "description": "Concordance of genotype for multi caller VCF", 6878 "available": True, 6879 "function_name": "calculation_genotype_concordance", 6880 "function_params": [], 6881 }, 6882 "BARCODE": { 6883 "type": "python", 6884 "name": "BARCODE", 6885 "description": "BARCODE as VaRank tool", 6886 "available": True, 6887 "function_name": "calculation_barcode", 6888 "function_params": [], 6889 }, 6890 "BARCODEFAMILY": { 6891 "type": "python", 6892 "name": "BARCODEFAMILY", 6893 "description": "BARCODEFAMILY as VaRank tool", 6894 "available": True, 6895 "function_name": "calculation_barcode_family", 6896 "function_params": ["BCF"], 6897 }, 6898 "TRIO": { 6899 "type": "python", 6900 "name": "TRIO", 6901 "description": "Inheritance for a trio family", 6902 "available": True, 6903 "function_name": "calculation_trio", 6904 "function_params": [], 6905 }, 6906 "VAF": { 6907 "type": "python", 6908 "name": "VAF", 6909 "description": "Variant Allele Frequency (VAF) harmonization", 6910 "available": True, 6911 "function_name": "calculation_vaf_normalization", 6912 "function_params": [], 6913 }, 6914 "VAF_stats": { 6915 "type": "python", 6916 "name": "VAF_stats", 6917 "description": "Variant Allele Frequency (VAF) statistics", 6918 "available": True, 6919 "function_name": "calculation_genotype_stats", 6920 "function_params": ["VAF"], 6921 }, 6922 "DP_stats": { 6923 "type": "python", 6924 "name": "DP_stats", 6925 "description": "Depth (DP) statistics", 6926 "available": True, 6927 "function_name": "calculation_genotype_stats", 6928 "function_params": ["DP"], 6929 }, 6930 "variant_id": { 6931 "type": "python", 6932 "name": "variant_id", 6933 "description": "Variant ID generated from variant position and type", 6934 "available": True, 6935 "function_name": "calculation_variant_id", 6936 "function_params": [], 6937 }, 6938 "transcripts_json": { 6939 "type": "python", 6940 "name": "transcripts_json", 6941 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6942 "available": True, 6943 "function_name": "calculation_transcripts_annotation", 6944 "function_params": ["transcripts_json", None], 6945 }, 6946 "transcripts_ann": { 6947 "type": "python", 6948 "name": "transcripts_ann", 6949 "description": "Add transcripts annotations in structured format (field 
'transcripts_ann')", 6950 "available": True, 6951 "function_name": "calculation_transcripts_annotation", 6952 "function_params": [None, "transcripts_ann"], 6953 }, 6954 "transcripts_annotations": { 6955 "type": "python", 6956 "name": "transcripts_annotations", 6957 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6958 "available": True, 6959 "function_name": "calculation_transcripts_annotation", 6960 "function_params": [None, None], 6961 }, 6962 "transcripts_prioritization": { 6963 "type": "python", 6964 "name": "transcripts_prioritization", 6965 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6966 "available": True, 6967 "function_name": "calculation_transcripts_prioritization", 6968 "function_params": [], 6969 }, 6970 "transcripts_export": { 6971 "type": "python", 6972 "name": "transcripts_export", 6973 "description": "Export transcripts table/view as a file (using param.json)", 6974 "available": True, 6975 "function_name": "calculation_transcripts_export", 6976 "function_params": [], 6977 }, 6978 }, 6979 "prioritizations": { 6980 "default": { 6981 "ANN2": [ 6982 { 6983 "type": "contains", 6984 "value": "HIGH", 6985 "score": 5, 6986 "flag": "PASS", 6987 "comment": [ 6988 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6989 ], 6990 }, 6991 { 6992 "type": "contains", 6993 "value": "MODERATE", 6994 "score": 3, 6995 "flag": "PASS", 6996 "comment": [ 6997 "A non-disruptive variant that might change protein effectiveness" 6998 ], 6999 }, 7000 { 7001 "type": "contains", 7002 "value": "LOW", 7003 "score": 0, 7004 "flag": "FILTERED", 7005 "comment": [ 7006 "Assumed to be mostly harmless or unlikely to change protein behavior" 7007 ], 7008 }, 7009 { 7010 "type": "contains", 7011 "value": "MODIFIER", 7012 "score": 0, 7013 "flag": "FILTERED", 7014 "comment": [ 7015 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7016 ], 7017 }, 7018 ], 7019 } 7020 }, 7021 } 7022 7023 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.
Parameters
- name: the name of the default configuration to retrieve (e.g. "calculations" or "prioritizations")
Returns
The default configuration dictionary matching `name`. If `name` matches no key in `config_default`, None is returned.
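For example, with a Variants object at hand:

calc_defaults = variants.get_config_default("calculations")
print(calc_defaults["VARTYPE"]["description"])  # Variant type (e.g. SNV, INDEL, MNV, BND...)
print(variants.get_config_default("unknown"))   # None (not an empty dictionary)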
7025 def get_config_json( 7026 self, name: str, config_dict: dict = {}, config_file: str = None 7027 ) -> dict: 7028 """ 7029 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7030 default values, a dictionary, and a file. 7031 7032 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7033 the name of the configuration. It is used to identify and retrieve the configuration settings 7034 for a specific component or module 7035 :type name: str 7036 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7037 dictionary that allows you to provide additional configuration settings or overrides. When you 7038 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7039 the key is the configuration setting you want to override or 7040 :type config_dict: dict 7041 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7042 specify the path to a configuration file that contains additional settings. If provided, the 7043 function will read the contents of this file and update the configuration dictionary with the 7044 values found in the file, overriding any existing values with the 7045 :type config_file: str 7046 :return: The function `get_config_json` returns a dictionary containing the configuration 7047 settings. 7048 """ 7049 7050 # Create with default prioritizations 7051 config_default = self.get_config_default(name=name) 7052 configuration = config_default 7053 # log.debug(f"configuration={configuration}") 7054 7055 # Replace prioritizations from dict 7056 for config in config_dict: 7057 configuration[config] = config_dict[config] 7058 7059 # Replace prioritizations from file 7060 config_file = full_path(config_file) 7061 if config_file: 7062 if os.path.exists(config_file): 7063 with open(config_file) as config_file_content: 7064 config_file_dict = yaml.safe_load(config_file_content) 7065 for config in config_file_dict: 7066 configuration[config] = config_file_dict[config] 7067 else: 7068 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7069 log.error(msg_error) 7070 raise ValueError(msg_error) 7071 7072 return configuration
The function get_config_json retrieves a configuration object (e.g. prioritizations) by merging default values, a dictionary, and a file.
Parameters
- name: the name of the configuration. It is used to identify and retrieve the default configuration settings for a specific component or module
- config_dict: a dictionary of additional configuration settings or overrides. Each key-value pair replaces the corresponding entry of the default configuration
- config_file: the path to a configuration file (YAML or JSON) containing additional settings. If provided, the function reads the file and updates the configuration dictionary with its values, overriding any existing entries
Returns
A dictionary containing the merged configuration settings.
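Merge order is: built-in defaults first, then config_dict, then the file, with later sources winning per key. A sketch with a hypothetical profile and file name (the file is parsed with yaml.safe_load, so YAML or plain JSON both work, and it must exist or a ValueError is raised):

merged = variants.get_config_json(
    name="prioritizations",
    config_dict={"stringent": {"ANN2": []}},  # hypothetical extra profile
    config_file="prioritizations.yml",        # hypothetical file
)
print(list(merged.keys()))  # e.g. ['default', 'stringent', ...]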
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        :param table: The `table` parameter is used to specify the name of the table
        (presumably a VCF file) on which the prioritization operation will be performed. If
        a table name is provided, the method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
        provided, the code will use the default prefix value "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter is used to pass additional parameters specific to
        the prioritization process. These parameters can include settings related to prioritization
        profiles, fields, scoring modes, flags, comments, and other configurations needed for the
        prioritization of variants in a VCF
        :type pz_param: dict
        :return: A boolean value (True) is returned from the `prioritization` function.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, use all profiles in the config
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info("Prioritization...")
        else:
            log.debug("No profile defined")
            return False

        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles available: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns
        added_columns = []

        # Create list of PZfields
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZfields not already present in the header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Create INFO fields if they do not exist
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if they do not exist, for each profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add one working column per PZfield
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

        # Profiles
        if profiles:

            # For each profile in the configuration file
            for profile in prioritizations_config:

                # If profile is asked in param, or ALL are asked (empty profiles [])
                if profile in profiles or profiles == []:
                    log.info(f"Profile '{profile}'")

                    sql_set_info_option = ""
                    sql_set_info = []

                    # PZ fields set

                    # PZScore
                    if (
                        f"{pz_prefix}Score{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Score{pzfields_sep}{profile}=',
                                {pz_prefix}Score{pzfields_sep}{profile}
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Score" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Score=',
                                {pz_prefix}Score{pzfields_sep}{profile}
                            )
                            """
                        )

                    # PZFlag
                    if (
                        f"{pz_prefix}Flag{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                CASE
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                    THEN 'PASS'
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                    THEN 'FILTERED'
                                END
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Flag" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Flag=',
                                CASE
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                    THEN 'PASS'
                                    WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                    THEN 'FILTERED'
                                END
                            )
                            """
                        )

                    # PZClass
                    if (
                        f"{pz_prefix}Class{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Class{pzfields_sep}{profile}=',
                                CASE
                                    WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                    ELSE '.'
                                END
                            )
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Class" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            concat(
                                '{pz_prefix}Class=',
                                CASE
                                    WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                    ELSE '.'
                                END
                            )
                            """
                        )

                    # PZComment
                    if (
                        f"{pz_prefix}Comment{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Comment" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )

                    # PZInfos
                    if (
                        f"{pz_prefix}Infos{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )
                    if (
                        profile == default_profile
                        and f"{pz_prefix}Infos" in list_of_pzfields
                    ):
                        sql_set_info.append(
                            f"""
                            CASE
                                WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                ELSE ''
                            END
                            """
                        )

                    # Merge PZfields
                    sql_set_info_option = ""
                    sql_set_sep = ""
                    for sql_set in sql_set_info:
                        if sql_set_sep:
                            sql_set_info_option += f"""
                                , concat('{sql_set_sep}', {sql_set})
                            """
                        else:
                            sql_set_info_option += f"""
                                , {sql_set}
                            """
                        sql_set_sep = ";"

                    sql_queries = []
                    for annotation in prioritizations_config[profile]:

                        # Skip special sections
                        if annotation.startswith("_"):
                            continue

                        # For each criterion
                        for criterion in prioritizations_config[profile][
                            annotation
                        ]:

                            # Criterion mode
                            criterion_mode = None
                            if np.any(
                                np.isin(list(criterion.keys()), ["type", "value"])
                            ):
                                criterion_mode = "operation"
                            elif np.any(
                                np.isin(list(criterion.keys()), ["sql", "fields"])
                            ):
                                criterion_mode = "sql"
                            log.debug(f"Criterion Mode: {criterion_mode}")

                            # Criterion parameters
                            criterion_type = criterion.get("type", None)
                            criterion_value = criterion.get("value", None)
                            criterion_sql = criterion.get("sql", None)
                            criterion_fields = criterion.get("fields", None)
                            criterion_score = criterion.get("score", 0)
                            criterion_flag = criterion.get("flag", "PASS")
                            criterion_class = criterion.get("class", None)
                            criterion_flag_bool = criterion_flag == "PASS"
                            criterion_comment = (
                                ", ".join(criterion.get("comment", []))
                                .replace("'", "''")
                                .replace(";", ",")
                                .replace("\t", " ")
                            )
                            criterion_infos = (
                                str(criterion)
                                .replace("'", "''")
                                .replace(";", ",")
                                .replace("\t", " ")
                            )

                            # SQL
                            if criterion_sql is not None and isinstance(
                                criterion_sql, list
                            ):
                                criterion_sql = " ".join(criterion_sql)

                            # Fields and explode
                            if criterion_fields is None:
                                criterion_fields = [annotation]
                            if not isinstance(criterion_fields, list):
                                criterion_fields = str(criterion_fields).split(",")

                            # Class
                            if criterion_class is not None and not isinstance(
                                criterion_class, list
                            ):
                                criterion_class = str(criterion_class).split(",")

                            for annotation_field in criterion_fields:

                                # Explode specific annotation
                                log.debug(
                                    f"Explode annotation '{annotation_field}'"
                                )
                                added_columns += self.explode_infos(
                                    prefix=explode_infos_prefix,
                                    fields=[annotation_field],
                                    table=table_variants,
                                )
                                extra_infos = self.get_extra_infos(
                                    table=table_variants
                                )

                                # Check if annotation field is present
                                if (
                                    f"{explode_infos_prefix}{annotation_field}"
                                    not in extra_infos
                                ):
                                    msg_err = f"Annotation '{annotation_field}' not in data"
                                    log.error(msg_err)
                                    raise ValueError(msg_err)
                                else:
                                    log.debug(
                                        f"Annotation '{annotation_field}' in data"
                                    )

                            sql_set = []
                            sql_set_info = []

                            # PZ fields set

                            # PZScore
                            if (
                                f"{pz_prefix}Score{pzfields_sep}{profile}"
                                in list_of_pzfields
                            ):
                                # VaRank prioritization score mode
                                if prioritization_score_mode.upper().strip() in [
                                    "VARANK",
                                    "MAX",
                                    "MAXIMUM",
                                    "TOP",
                                ]:
                                    sql_set.append(
                                        f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END "
                                    )
                                # Default HOWARD prioritization score mode
                                else:
                                    sql_set.append(
                                        f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                    )

                            # PZFlag
                            if (
                                f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                in list_of_pzfields
                            ):
                                sql_set.append(
                                    f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                )

                            # PZClass
                            if (
                                f"{pz_prefix}Class{pzfields_sep}{profile}"
                                in list_of_pzfields
                                and criterion_class is not None
                            ):
                                sql_set.append(
                                    f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                )

                            # PZComment
                            if (
                                f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                in list_of_pzfields
                            ):
                                sql_set.append(
                                    f"""
                                    {pz_prefix}Comment{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Comment{pzfields_sep}{profile},
                                            CASE
                                                WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                THEN ', '
                                                ELSE ''
                                            END,
                                            '{criterion_comment}'
                                        )
                                    """
                                )

                            # PZInfos
                            if (
                                f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                in list_of_pzfields
                            ):
                                sql_set.append(
                                    f"""
                                    {pz_prefix}Infos{pzfields_sep}{profile} =
                                        concat(
                                            {pz_prefix}Infos{pzfields_sep}{profile},
                                            '{criterion_infos}'
                                        )
                                    """
                                )
                            sql_set_option = ",".join(sql_set)

                            # Criterion and comparison
                            if sql_set_option:

                                if criterion_mode in ["operation"]:

                                    try:
                                        float(criterion_value)
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                        """
                                    except (ValueError, TypeError):
                                        contains_option = ""
                                        if criterion_type == "contains":
                                            contains_option = ".*"
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                        """
                                    sql_queries.append(sql_update)

                                elif criterion_mode in ["sql"]:

                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE {criterion_sql}
                                    """
                                    sql_queries.append(sql_update)

                                else:
                                    msg_err = "Prioritization criterion mode failed (either 'operation' or 'sql')"
                                    log.error(msg_err)
                                    raise ValueError(msg_err)

                            else:
                                log.warning(
                                    f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                )

                    # PZTags
                    if (
                        f"{pz_prefix}Tags{pzfields_sep}{profile}"
                        in list_of_pzfields
                    ):

                        # Create PZTags value
                        pztags_value = ""
                        pztags_sep_default = ","
                        pztags_sep = ""
                        for pzfield in pzfields:
                            if pzfield not in [f"{pz_prefix}Tags"]:
                                if (
                                    f"{pzfield}{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if pzfield in [f"{pz_prefix}Flag"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                            END, '"""
                                    elif pzfield in [f"{pz_prefix}Class"]:
                                        pztags_value += f"""{pztags_sep}{pzfield}#',
                                            CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                ELSE '.'
                                            END, '"""
                                    else:
                                        pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                    pztags_sep = pztags_sep_default

                        # Add update query for PZTags
                        sql_update_pztags = f"""
                            UPDATE {table_variants}
                            SET INFO = concat(
                                INFO,
                                CASE WHEN INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END,
                                '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                            )
                        """
                        sql_queries.append(sql_update_pztags)

                        # Add update query for PZTags for the default profile
                        if profile == default_profile:
                            sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    ';',
                                    '{pz_prefix}Tags={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags_default)

                    log.info(f"Profile '{profile}' - Prioritization...")

                    if sql_queries:

                        for sql_query in sql_queries:
                            log.debug(
                                f"Profile '{profile}' - Prioritization query: {sql_query}..."
                            )
                            self.conn.execute(sql_query)

                        log.info(f"Profile '{profile}' - Update...")
                        sql_query_update = f"""
                            UPDATE {table_variants}
                            SET INFO =
                                concat(
                                    CASE
                                        WHEN INFO NOT IN ('','.')
                                        THEN concat(INFO, ';')
                                        ELSE ''
                                    END
                                    {sql_set_info_option}
                                )
                        """
                        self.conn.execute(sql_query_update)

        else:

            log.warning("No profiles in parameters")

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFO fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Local imports for polars and dask (assumed available; they are not
        # among the module-level imports shown at the top of this file)
        import polars as pl
        import dask.dataframe as dd

        # Function applied to each partition of the Dask DataFrame
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: a pandas DataFrame that contains the data to be processed
            :return: the result of applying `annotation_hgvs_partition` to each row of the
            `partition` DataFrame along axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains values for the keys CHROM, POS, REF and ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refSeq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refSeq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info("Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info("HGVS Annotation...")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq and refSeqLink
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug("refSeq loading...")
        # refSeq in DuckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Load all refSeq in a DataFrame
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug("refSeqLink loading...")
            # refSeqLink in DuckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Load all refSeqLink in a DataFrame
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars DataFrame
            refseqlink_df = self.conn.query(refseqlink_query).pl()

        # Read RefSeq transcripts into a python dict/model
        log.debug("Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask DataFrame from the Pandas DataFrame, with as many partitions as threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use map_partitions() to apply the function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas DataFrame
        df = ddf.compute()

        # Convert Pandas DataFrame to parquet (due to an error casting VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": "HGVS annotation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
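As a usage sketch, HGVS annotation can be driven entirely through the `hgvs` section of the parameters. The file paths, assembly and option values below are illustrative assumptions (genome and refSeq databases are assumed to be configured under the config folders read above):

# Minimal sketch: enable HGVS annotation through the 'hgvs' param section.
variants = Variants(input="sample.vcf.gz", output="sample.hgvs.vcf.gz")
variants.set_param({
    "assembly": "hg19",        # assumed assembly
    "hgvs": {
        "use_exon": True,      # append exon numbering to each HGVS name
        "add_protein": True,   # also emit the protein-level nomenclature
        "codon_type": "3",     # three-letter amino-acid codes
    },
})
variants.load_data()
variants.annotation_hgvs(threads=4)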
    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:

        # Init
        operations_help = []

        # Operations
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f"   {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # Insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help
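A short sketch of how this helper is typically consumed, for instance to print the catalogue of available calculations (the actual lines depend on what the calculations config declares):

# Print the sorted list of available calculation operations.
variants = Variants()
for line in variants.get_operations_help():
    print(line)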
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations and, for each operation, checks whether it is a Python or SQL
        operation, then calls the appropriate function.

        Param JSON example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    }
                },
                "middle": null
            }
        """

        # Param
        param = self.get_param()

        # Check operations config file
        if operations_config_file is None:
            operations_config_file = param.get("calculation", {}).get(
                "calculation_config", None
            )

        # Operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info("Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info("Calculations...")

            # For each operation
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFO fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
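To make the parameter shapes concrete, here is a hedged sketch combining the quick form (`calculations` as a comma-separated string) with the structured form; the NOMEN options follow the docstring example above, and any other operation name would have to exist in the calculations config:

# Sketch: quick form plus structured form for the NOMEN calculation.
variants.set_param({
    # quick form: comma-separated operation names
    "calculations": "NOMEN",
    # structured form: per-operation options
    "calculation": {
        "calculations": {
            "NOMEN": {"options": {"hgvs_field": "hgvs"}},
        },
    },
})
variants.calculation()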
    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.

        :param operation: The `operation` parameter is a dictionary that contains information about
        the mathematical operation to be performed
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the mathematical operation being performed. It is used for logging and error handling
        purposes, defaults to unknown
        :type operation_name: str (optional)
        """

        # Operation infos
        operation_name = operation.get("name", "unknown")
        log.debug(f"process SQL {operation_name}")
        output_column_name = operation.get("output_column_name", operation_name)
        output_column_type = operation.get("output_column_type", "String")
        prefix = operation.get("explode_infos_prefix", "")
        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
        output_column_description = operation.get(
            "output_column_description", f"{operation_name} operation"
        )
        operation_query = operation.get("operation_query", None)
        if isinstance(operation_query, list):
            operation_query = " ".join(operation_query)
        operation_info_fields = operation.get("info_fields", [])
        operation_info_fields_check = operation.get("info_fields_check", False)
        operation_info = operation.get("operation_info", True)
        operation_table = operation.get(
            "table", self.get_table_variants(clause="alter")
        )

        # Table variants
        if operation_table:
            table_variants = operation_table
        else:
            table_variants = self.get_table_variants(clause="alter")

        if operation_query:

            # Info fields check
            operation_info_fields_check_result = True
            if operation_info_fields_check:
                header_infos = self.get_header().infos
                for info_field in operation_info_fields:
                    operation_info_fields_check_result = (
                        operation_info_fields_check_result
                        and info_field in header_infos
                    )

            # If info fields available
            if operation_info_fields_check_result:

                # Added columns
                added_columns = []

                # Create VCF header field
                vcf_reader = self.get_header()
                vcf_reader.infos[output_column_name] = vcf.parser._Info(
                    output_column_name,
                    ".",
                    output_column_type,
                    output_column_description,
                    "howard calculation",
                    "0",
                    self.code_type_map.get(output_column_type),
                )

                # Explode infos if needed
                log.debug(f"calculation_process_sql prefix {prefix}")
                added_columns += self.explode_infos(
                    prefix=prefix,
                    fields=[output_column_name] + operation_info_fields,
                    force=False,
                    table=table_variants,
                )

                # Create column
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=prefix + output_column_name,
                    column_type=output_column_type_sql,
                    default_value="null",
                )
                added_columns.append(added_column)

                # Operation calculation
                try:

                    # Query to update calculation column
                    sql_update = f"""
                        UPDATE {table_variants}
                        SET "{prefix}{output_column_name}" = ({operation_query})
                    """
                    self.conn.execute(sql_update)

                    # Add to INFO
                    if operation_info:
                        sql_update_info = f"""
                            UPDATE {table_variants}
                            SET "INFO" =
                                concat(
                                    CASE
                                        WHEN "INFO" IS NOT NULL
                                        THEN concat("INFO", ';')
                                        ELSE ''
                                    END,
                                    '{output_column_name}=',
                                    "{prefix}{output_column_name}"
                                )
                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                        """
                        self.conn.execute(sql_update_info)

                except Exception:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' query failed"
                    )

                # Remove added columns
                for added_column in added_columns:
                    log.debug(f"added_column: {added_column}")
                    self.drop_column(column=added_column)

            else:
                log.error(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
                )

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' query NOT defined"
            )
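The dictionary below shows the operation keys this method reads (`operation_query`, `info_fields`, `info_fields_check`, `output_column_*`, `explode_infos_prefix`, `operation_info`); the operation itself is a made-up example for illustration, not one shipped with the calculations config:

# Hypothetical SQL operation definition, exercising the keys read above.
example_operation = {
    "name": "DP_CLASS",                        # made-up operation name
    "type": "sql",
    "output_column_name": "DP_CLASS",
    "output_column_type": "String",
    "output_column_description": "Depth class based on INFO/DP",
    "info_fields": ["DP"],                     # INFO fields the query relies on
    "info_fields_check": True,                 # fail early if DP is absent
    "operation_query": [
        "CASE WHEN CAST(\"INFO/DP\" AS INTEGER) >= 30",
        "THEN 'high' ELSE 'low' END",
    ],
    "explode_infos_prefix": "INFO/",           # exploded columns become "INFO/DP", ...
    "operation_info": True,                    # write the result back into INFO
}
variants.calculation_process_sql(operation=example_operation)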
    def calculation_process_function(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_function` takes in an operation dictionary and performs the specified
        function with the given parameters.

        :param operation: The `operation` parameter is a dictionary that contains information about
        the operation to be performed
        :type operation: dict
        :param operation_name: The `operation_name` parameter is a string that represents the name of
        the operation being performed. It is used for logging purposes, defaults to unknown
        :type operation_name: str (optional)
        """

        operation_name = operation["name"]
        log.debug(f"process Python {operation_name}")
        function_name = operation["function_name"]
        function_params = operation["function_params"]
        getattr(self, function_name)(*function_params)
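For `type: "python"` operations the config simply names a method on this class plus its positional arguments, dispatched via getattr(). The sketch below targets `calculation_find_by_pipeline` (defined later in this class); the tag value is a hypothetical choice:

# Sketch: a 'python' operation dispatching to a method of Variants.
example_operation = {
    "name": "FINDBYPIPELINE",
    "type": "python",
    "function_name": "calculation_find_by_pipeline",
    "function_params": ["findbypipeline"],   # positional args, here the INFO tag
}
variants.calculation_process_function(operation=example_operation)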
    def calculation_variant_id(self) -> None:
        """
        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
        updates the INFO field of a variants table with the variant ID.
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id tags
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter is used to specify the name of the column that
        will store the HGVS nomenclatures extracted from the SnpEff annotation field, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter represents the field in the VCF file that
        contains SnpEff annotations, from which the HGVS nomenclatures are extracted, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please annotate with snpEff before using this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
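The header parsing above relies on snpEff quoting its sub-field list inside the ANN description. A standalone sketch of that extraction, using a typical (abridged) snpEff description string:

import re

# Typical shape of a snpEff ANN description (abridged); the regex pulls
# the quoted sub-field list and splits it on ' | '.
ann_description = (
    "Functional annotations: 'Allele | Annotation | Gene_Name | HGVS.c | HGVS.p'"
)
match = re.search(r"'(.+?)'", ann_description)
ann_header = match.group(1).split(" | ")
# -> ['Allele', 'Annotation', 'Gene_Name', 'HGVS.c', 'HGVS.p']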
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether the output
        should be uniquified, i.e. whether duplicate entries should be removed, defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the format in which the output
        annotations will be generated. Set it to "JSON" to output the annotations in JSON format,
        defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is used to specify the prefix that will be
        added to the output annotations generated during the calculation process. This prefix helps to
        differentiate the newly added annotations from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter is used to specify the field in the VCF file
        that contains SnpEff annotations. This field will be processed to explode the annotations and
        update the variant information accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please annotate with snpEff before using this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
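A usage sketch contrasting the two output formats; in "fields" mode each snpEff sub-field becomes an INFO tag prefixed with `output_prefix` (here the default `snpeff_`):

# Explode ANN into one INFO tag per snpEff sub-field (snpeff_Allele,
# snpeff_Annotation, ...), with duplicate entries removed.
variants.calculation_snpeff_ann_explode(uniquify=True, output_format="fields")

# Or keep everything in a single JSON-valued INFO tag instead.
variants.calculation_snpeff_ann_explode(output_format="JSON")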
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        threads = self.get_threads()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Added columns
        added_columns = []

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts list of preference sources
        transcripts_sources = {}

        # Get transcripts
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(
                fields=[transcripts_column], table=transcripts_table
            )
        else:
            extra_field_transcript = "NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Transcripts rank
            transcripts_rank = {
                transcript: rank for rank, transcript in enumerate(transcripts, start=1)
            }
            transcripts_len = len(transcripts_rank)

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len,
                ),
                axis=1,
            )

            # Explode NOMEN structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" IS NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()
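A hedged usage sketch: the update above appends `tag=value` to each variant's INFO, where the value comes from the `findbypipeline` helper defined elsewhere in howard; the INFO strings below are illustrative only.

# Given a loaded Variants instance `variants`:
# variants.calculation_find_by_pipeline(tag="findbypipeline")
#
# INFO before update:  DP=20
# INFO after update:   DP=20;findbypipeline=<value computed per pipeline/sample>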
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(
                    genotypeconcordance_tag, "Concordance of genotype for multi caller VCF"
                ),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" IS NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, "barcode calculation (VaRank)"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                                AND dataframe_barcode."{barcode_infos}" IS NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped_file:
                        ped = yaml.safe_load(ped_file)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in ped_samples]
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = (
                        "'"
                        + ",".join([f""" "{sample}" """ for sample in ped_samples])
                        + "'"
                    )
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.', regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped_file:
                        trio_ped = yaml.safe_load(trio_ped_file)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "trio calculation"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                                AND dataframe_trio."{trio_infos}" IS NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_trio
            gc.collect()
    def calculation_vaf_normalization(self) -> None:
        """
        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
        :return: The function does not return anything.
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_normalization annotation field
            vaf_normalization_tag = "VAF"

            # VCF infos tags
            vcf_infos_tags = {
                "VAF": "VAF Variant Frequency",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Do not calculate if VAF already exists
            if "VAF" in vcf_reader.formats:
                log.debug("VAF already on genotypes")
                return

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                f""" "{sample}" """ for sample in self.get_header_sample_list()
            )

            # Create dataframe
            query = f""" SELECT {samples_fields} FROM {table_variants} """
            log.debug(f"query={query}")
            dataframe_vaf_normalization = self.get_query_to_df(query=query)

            vaf_normalization_set = []

            # for each sample vaf_normalization
            for sample in self.get_header_sample_list():
                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                    lambda row: vaf_normalization(row, sample=sample), axis=1
                )
                vaf_normalization_set.append(
                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
                )

            # Add VAF to FORMAT
            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
                "FORMAT"
            ].apply(lambda x: str(x) + ":VAF")
            vaf_normalization_set.append(
                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
            )

            # Add vaf_normalization to header
            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
                id=vaf_normalization_tag,
                num="1",
                type="Float",
                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
                type_code=self.code_type_map.get("Float"),
            )

            # Create fields to add in INFO
            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_vaf_normalization_set}
                FROM dataframe_vaf_normalization
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_normalization
            gc.collect()
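To illustrate the FORMAT and genotype rewrite performed above (the VAF value itself is computed by the `vaf_normalization` helper defined elsewhere in howard; the genotype values below are invented for illustration):

# Illustrative effect only; values are hypothetical.
#   FORMAT   GT:AD:DP     ->  GT:AD:DP:VAF
#   sample1  0/1:12,8:20  ->  0/1:12,8:20:<VAF computed by vaf_normalization()>
#
# Given a loaded Variants instance `variants`:
# variants.calculation_vaf_normalization()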
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - median {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stats field to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" IS NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.

        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
        is a string parameter that represents the information field to be used in the transcripts JSON.
        It is used to specify the JSON format for the transcripts information. If no value is provided
        when calling the method, it defaults to `None`
        :type info_json: str
        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
        method is a string parameter that specifies the format of the information field to be used in
        the transcripts JSON. It is used to define the format of the information field
        :type info_format: str
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")
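A hedged usage sketch; the INFO field names passed here are hypothetical, not defaults (both parameters default to `None`).

# Given a loaded Variants instance `variants`:
# variants.calculation_transcripts_annotation(
#     info_json="transcripts_json",      # INFO field receiving the transcripts JSON (hypothetical name)
#     info_format="transcripts_format",  # INFO field describing its format (hypothetical name)
# )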
    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")
    def calculation_transcripts_export(self) -> None:
        """
        The `calculation_transcripts_export` function creates a transcripts table and exports it
        if transcripts are available.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Export transcripts
        if transcripts_table:
            self.transcripts_export(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")
    def transcripts_export(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_export` function exports the transcripts table to the output file defined
        in the export parameters, building an INFO column when the output format is VCF.

        :param transcripts_table: The name of the table containing the transcripts data
        :type transcripts_table: str
        :param param: A dictionary of parameters; if empty, the object's parameters are used
        :type param: dict
        :return: `True` if the export was performed, `False` if no export parameters are defined
        """

        log.debug("Start transcripts export...")

        # Param
        if not param:
            param = self.get_param()

        # Param export
        param_transcript_export = param.get("transcripts", {}).get("export", {})

        # Output file
        transcripts_export_output = param_transcript_export.get("output", None)

        if not param_transcript_export or not transcripts_export_output:
            log.warning("No transcripts export parameters defined!")
            return False

        # List of transcripts annotations
        query_describe = f"""
            SELECT column_name
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
        """
        transcripts_annotations_list = list(
            self.get_query_to_df(query=query_describe)["column_name"]
        )

        # Create transcripts table for export
        transcripts_table_export = f"{transcripts_table}_export_" + "".join(
            random.choices(string.ascii_uppercase + string.digits, k=10)
        )
        query_create_transcripts_table_export = f"""
            CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table})
        """
        self.execute_query(query=query_create_transcripts_table_export)

        # Output file format
        transcripts_export_output_format = get_file_format(
            filename=transcripts_export_output
        )

        # Format VCF - construct INFO
        if transcripts_export_output_format in ["vcf"]:

            # Construct query update INFO and header
            query_update_info = []
            for field in transcripts_annotations_list:

                # If field not in header
                if field not in self.get_header_infos_list():

                    # Add field to header
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Add field as INFO/tag
                query_update_info.append(
                    f"""
                    CASE
                        WHEN "{field}" IS NOT NULL
                        THEN concat('{field}=', "{field}", ';')
                        ELSE ''
                    END
                    """
                )

            # Query param
            query_update_info_value = (
                f""" concat('', {", ".join(query_update_info)}) """
            )
            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """

        else:

            # Query param
            query_update_info_value = " NULL "
            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """

        # Update query INFO column
        query_update = f"""
            UPDATE {transcripts_table_export}
            SET INFO = {query_update_info_value}
        """
        self.execute_query(query=query_update)

        # Export
        self.export_output(
            output_file=transcripts_export_output,
            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
        )

        # Drop transcripts export table
        query_drop_transcripts_table_export = f"""
            DROP TABLE {transcripts_table_export}
        """
        self.execute_query(query=query_drop_transcripts_table_export)

        return True
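A hedged sketch of the export parameters read by `transcripts_export`; the output path is hypothetical. A VCF output triggers the INFO-building branch above, while other formats keep the annotations as plain columns.

param = {
    "transcripts": {
        "export": {
            "output": "/path/to/transcripts.tsv",  # hypothetical output path
        }
    }
}

# Given a loaded Variants instance `variants`:
# variants.transcripts_export(transcripts_table="transcripts", param=param)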
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and others
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                msg_err = "No Transcripts table available"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields
        pz_param_pzfields = {}

        # PZ field transcripts
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add PZ field in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                msg_err = f"Transcript file '{transcripts_preference_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

            # Order by depending on transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export transcripts prioritization infos to variants table
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
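A hedged sketch of the prioritization parameters read above; the profile name and file path are hypothetical, and only the keys visible in the lookups above are assumed.

param = {
    "transcripts": {
        "prioritization": {
            "profiles": ["default"],  # required: prioritization profile(s) (hypothetical name)
            "pzprefix": "PTZ",  # prefix for PZ fields (default "PTZ")
            "pzfields": ["Score", "Flag"],  # PZ fields promoted to the variants INFO
            "prioritization_transcripts": "/path/to/transcripts.tsv",  # preference list (hypothetical path)
            "prioritization_transcripts_force": False,  # preference list ranks before scores
            "prioritization_transcripts_version_force": False,  # match transcript versions exactly
            "prioritization_transcripts_order": {"PTZFlag": "DESC", "PTZScore": "DESC"},  # default order
        }
    }
}

# Given a loaded Variants instance `variants`:
# variants.transcripts_prioritization(param=param)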
    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list, list]:
        """
        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
        specified columns mapping for transcripts data.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table where the transcripts data is stored or will be stored in the database. This table
        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
        scores, predictions, etc., defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param columns_maps: The `columns_maps` parameter contains information about how to map columns
        from a transcripts table to create a view. Each entry in `columns_maps` represents a mapping
        configuration for a specific set of columns. It typically includes details such as the main
        transcript column and additional information columns
        :type columns_maps: dict
        :param added_columns: The `added_columns` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
        that will be added to the view being created based on the columns map provided. These columns
        are generated by exploding the transcript information columns along with the main transcript
        column
        :type added_columns: list
        :param temporary_tables: The `temporary_tables` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
        tables created during the process of creating a transcript view from a columns map. These
        temporary tables are used to store intermediate results or transformations before the final view
        is generated
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
        used for annotation in the query view creation process. These fields are extracted from the
        `transcripts_column` and `transcripts_infos_columns` specified in the `columns_maps`
        :type annotation_fields: list
        :param column_rename: The `column_rename` parameter in the
        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
        custom renaming for columns during the creation of the temporary table view. This parameter
        provides a mapping of original column names to the desired renamed column names
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter in the
        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
        column values should be cleaned or not. If set to `True`, the column values will be cleaned by
        removing any non-alphanumeric characters from them, defaults to False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
        function is used to specify the case transformation to be applied to the columns during the view
        creation process. It allows you to control whether the column values should be converted to
        lowercase, uppercase, or remain unchanged
        :type column_case: str
        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
        """

        log.debug("Start transcripts view creation from columns map...")

        # Example of a "from_columns_map" configuration:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Log
            log.debug(f"columns_map={columns_map}")

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            # Transcripts infos columns rename
            column_rename = columns_map.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = columns_map.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = columns_map.get("column_case", column_case)

            if transcripts_column is not None:

                # Explode
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses
                clause_select_variants = []
                clause_select_transcripts = []
                for field in [transcripts_column] + transcripts_infos_columns:

                    # AS field
                    as_field = field

                    # Rename
                    if column_rename:
                        as_field = column_rename.get(as_field, as_field)

                    # Clean
                    if column_clean:
                        as_field = clean_annotation_field(as_field)

                    # Case
                    if column_case:
                        if column_case.lower() in ["lower"]:
                            as_field = as_field.lower()
                        elif column_case.lower() in ["upper"]:
                            as_field = as_field.upper()

                    # Clause select variants
                    clause_select_variants.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )

                    if field in [transcripts_column]:
                        clause_select_transcripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                        )
                    else:
                        clause_select_transcripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
                        )
                        annotation_fields.append(as_field)

                # Query view
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select_transcripts)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select_variants)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Create temporary table
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # # Temporary tables (single-pass alternative)
                # temporary_tables.append(temporary_table)
                # query_view = f"""
                #     CREATE TEMPORARY TABLE {temporary_table}
                #     AS ({query})
                # """
                # self.execute_query(query=query_view)

                # Temporary tables
                temporary_tables.append(temporary_table)

                # List of unique #CHROM
                query_unique_chrom = """
                    SELECT DISTINCT "#CHROM"
                    FROM variants
                """
                unique_chroms = self.get_query_to_df(query=query_unique_chrom)

                # Create table with structure but without data
                query_create_table = f"""
                    CREATE TABLE {temporary_table}
                    AS ({query} LIMIT 0)
                """
                self.execute_query(query=query_create_table)

                # Process by #CHROM
                for chrom in unique_chroms["#CHROM"]:

                    # Log
                    log.debug(f"Processing #CHROM={chrom}")

                    # Select data by #CHROM
                    query_chunk = f"""
                        SELECT *
                        FROM ({query})
                        WHERE "#CHROM" = '{chrom}'
                    """

                    # Insert data
                    query_insert_chunk = f"""
                        INSERT INTO {temporary_table}
                        {query_chunk}
                    """
                    self.execute_query(query=query_insert_chunk)

        return added_columns, temporary_tables, annotation_fields
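An illustration of the `regexp_split_to_table` explosion above, with hypothetical dbNSFP-style values, assuming the split columns of one row expand in lockstep (positionally paired):

# One variants row such as:
#   Ensembl_transcriptid = "ENST00000361390,ENST00000361453"
#   LIST_S2_score        = "0.12,0.87"
# yields two rows in the transcripts view:
#   transcript = ENST00000361390, LIST_S2_score = 0.12
#   transcript = ENST00000361453, LIST_S2_score = 0.87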
10733 def create_transcript_view_from_column_format( 10734 self, 10735 transcripts_table: str = "transcripts", 10736 column_formats: dict = {}, 10737 temporary_tables: list = None, 10738 annotation_fields: list = None, 10739 column_rename: dict = {}, 10740 column_clean: bool = False, 10741 column_case: str = None, 10742 ) -> tuple[list, list]: 10743 """ 10744 The `create_transcript_view_from_column_format` function generates a transcript view based on 10745 specified column formats, adds additional columns and annotation fields, and returns the list of 10746 temporary tables and annotation fields. 10747 10748 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10749 of the table containing the transcripts data. This table will be used as the base table for 10750 creating the transcript view. The default value for this parameter is "transcripts", but you can 10751 provide a different table name if needed, defaults to transcripts 10752 :type transcripts_table: str (optional) 10753 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10754 about the columns to be used for creating the transcript view. Each entry in the dictionary 10755 specifies the mapping between a transcripts column and a transcripts infos column. This 10756 parameter allows you to define how the columns from the transcripts table should be transformed 10757 or mapped 10758 :type column_formats: dict 10759 :param temporary_tables: The `temporary_tables` parameter in the 10760 `create_transcript_view_from_column_format` function is a list that stores the names of 10761 temporary views created during the process of creating a transcript view from a column format. 10762 These temporary views are used to manipulate and extract data before generating the final 10763 transcript view 10764 :type temporary_tables: list 10765 :param annotation_fields: The `annotation_fields` parameter in the 10766 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10767 that are extracted from the temporary views created during the process. These annotation fields 10768 are obtained by querying the temporary views and extracting the column names excluding specific 10769 columns like `#CHROM` 10770 :type annotation_fields: list 10771 :param column_rename: The `column_rename` parameter in the 10772 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10773 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10774 column names to new column names in this dictionary, you can rename specific columns during the 10775 process 10776 :type column_rename: dict 10777 :param column_clean: The `column_clean` parameter in the 10778 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10779 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10780 will be cleaned during the creation of the transcript view based on the specified column format, 10781 defaults to False 10782 :type column_clean: bool (optional) 10783 :param column_case: The `column_case` parameter in the 10784 `create_transcript_view_from_column_format` function is used to specify the case transformation 10785 to be applied to the columns in the transcript view. 
It can be set to either "upper" or "lower" 10786 to convert the column names to uppercase or lowercase, respectively 10787 :type column_case: str 10788 :return: The `create_transcript_view_from_column_format` function returns two lists: 10789 `temporary_tables` and `annotation_fields`. 10790 """ 10791 10792 log.debug("Start transcripts view creation from column format...") 10793 10794 # "from_column_format": [ 10795 # { 10796 # "transcripts_column": "ANN", 10797 # "transcripts_infos_column": "Feature_ID", 10798 # } 10799 # ], 10800 10801 # Init 10802 if temporary_tables is None: 10803 temporary_tables = [] 10804 if annotation_fields is None: 10805 annotation_fields = [] 10806 10807 for column_format in column_formats: 10808 10809 # annotation field and transcript annotation field 10810 annotation_field = column_format.get("transcripts_column", "ANN") 10811 transcript_annotation = column_format.get( 10812 "transcripts_infos_column", "Feature_ID" 10813 ) 10814 10815 # Transcripts infos columns rename 10816 column_rename = column_format.get("column_rename", column_rename) 10817 10818 # Transcripts infos columns clean 10819 column_clean = column_format.get("column_clean", column_clean) 10820 10821 # Transcripts infos columns case 10822 column_case = column_format.get("column_case", column_case) 10823 10824 # Temporary View name 10825 temporary_view_name = transcripts_table + "".join( 10826 random.choices(string.ascii_uppercase + string.digits, k=10) 10827 ) 10828 10829 # Create temporary view name 10830 temporary_view_name = self.annotation_format_to_table( 10831 uniquify=True, 10832 annotation_field=annotation_field, 10833 view_name=temporary_view_name, 10834 annotation_id=transcript_annotation, 10835 column_rename=column_rename, 10836 column_clean=column_clean, 10837 column_case=column_case, 10838 ) 10839 10840 # Annotation fields 10841 if temporary_view_name: 10842 query_annotation_fields = f""" 10843 SELECT * 10844 FROM ( 10845 DESCRIBE SELECT * 10846 FROM {temporary_view_name} 10847 ) 10848 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10849 """ 10850 df_annotation_fields = self.get_query_to_df( 10851 query=query_annotation_fields 10852 ) 10853 10854 # Add temporary view and annotation fields 10855 temporary_tables.append(temporary_view_name) 10856 annotation_fields += list(set(df_annotation_fields["column_name"])) 10857 10858 return temporary_tables, annotation_fields
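The function above leans on DuckDB's DESCRIBE-in-a-subquery idiom to recover the column names of a freshly created view. A minimal sketch of that idiom (the table and columns here are hypothetical):

import duckdb

con = duckdb.connect()
con.execute("""CREATE TABLE t AS SELECT 'chr1' AS "#CHROM", 1 AS POS, 'tx1' AS Feature_ID""")

# DESCRIBE SELECT yields one row per column; filter out the fixed variant key columns
df = con.execute("""
    SELECT column_name
    FROM (DESCRIBE SELECT * FROM t)
    WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT')
""").df()

print(list(df["column_name"]))  # ['Feature_ID']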
10860 def create_transcript_view( 10861 self, 10862 transcripts_table: str = None, 10863 transcripts_table_drop: bool = False, 10864 param: dict = {}, 10865 ) -> str: 10866 """ 10867 The `create_transcript_view` function generates a transcript view by processing data from a 10868 specified table based on provided parameters and structural information. 10869 10870 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10871 is used to specify the name of the table that will store the final transcript view data. If a table 10872 name is not provided, the function will create a new table to store the transcript view data, and by 10873 default it is named "transcripts", defaults to transcripts 10874 :type transcripts_table: str (optional) 10875 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10876 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10877 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10878 the function will drop the existing transcripts table if it exists, defaults to False 10879 :type transcripts_table_drop: bool (optional) 10880 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10881 contains information needed to create a transcript view. It includes details such as the structure 10882 of the transcripts, columns mapping, column formats, and other necessary information for generating 10883 the view. This parameter allows for flexibility and customization 10884 :type param: dict 10885 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10886 created or modified during the execution of the function. 10887 """ 10888 10889 log.debug("Start transcripts view creation...") 10890 10891 # Default 10892 transcripts_table_default = "transcripts" 10893 10894 # Param 10895 if not param: 10896 param = self.get_param() 10897 10898 # Struct 10899 struct = param.get("transcripts", {}).get("struct", None) 10900 10901 # Transcript version 10902 transcript_id_remove_version = param.get("transcripts", {}).get( 10903 "transcript_id_remove_version", False 10904 ) 10905 10906 # Transcripts mapping file 10907 transcript_id_mapping_file = param.get("transcripts", {}).get( 10908 "transcript_id_mapping_file", None 10909 ) 10910 10911 # Transcripts mapping force 10912 transcript_id_mapping_force = param.get("transcripts", {}).get( 10913 "transcript_id_mapping_force", None 10914 ) 10915 10916 # Transcripts table 10917 if transcripts_table is None: 10918 transcripts_table = param.get("transcripts", {}).get( 10919 "table", transcripts_table_default 10920 ) 10921 10922 # Check transcripts table exists 10923 if transcripts_table: 10924 10925 # Query to check if transcripts table exists 10926 query_check_table = f""" 10927 SELECT * 10928 FROM information_schema.tables 10929 WHERE table_name = '{transcripts_table}' 10930 """ 10931 df_check_table = self.get_query_to_df(query=query_check_table) 10932 10933 # Check if transcripts table exists 10934 if len(df_check_table) > 0 and not transcripts_table_drop: 10935 log.debug(f"Table {transcripts_table} exists and not drop option") 10936 return transcripts_table 10937 10938 if struct: 10939 10940 # added_columns 10941 added_columns = [] 10942 10943 # Temporary tables 10944 temporary_tables = [] 10945 10946 # Annotation fields 10947 annotation_fields = [] 10948 10949 # from columns map 10950 columns_maps = struct.get("from_columns_map", []) 
10951 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 10952 self.create_transcript_view_from_columns_map( 10953 transcripts_table=transcripts_table, 10954 columns_maps=columns_maps, 10955 added_columns=added_columns, 10956 temporary_tables=temporary_tables, 10957 annotation_fields=annotation_fields, 10958 ) 10959 ) 10960 added_columns += added_columns_tmp 10961 temporary_tables += temporary_tables_tmp 10962 annotation_fields += annotation_fields_tmp 10963 10964 # from column format 10965 column_formats = struct.get("from_column_format", []) 10966 temporary_tables_tmp, annotation_fields_tmp = ( 10967 self.create_transcript_view_from_column_format( 10968 transcripts_table=transcripts_table, 10969 column_formats=column_formats, 10970 temporary_tables=temporary_tables, 10971 annotation_fields=annotation_fields, 10972 ) 10973 ) 10974 temporary_tables += temporary_tables_tmp 10975 annotation_fields += annotation_fields_tmp 10976 10977 # Remove some specific fields/column 10978 annotation_fields = list(set(annotation_fields)) 10979 for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]: 10980 if field in annotation_fields: 10981 annotation_fields.remove(field) 10982 10983 # Merge temporary tables query 10984 query_merge = "" 10985 for temporary_table in list(set(temporary_tables)): 10986 10987 # First temporary table 10988 if not query_merge: 10989 query_merge = f""" 10990 SELECT * FROM {temporary_table} 10991 """ 10992 # other temporary table (using UNION) 10993 else: 10994 query_merge += f""" 10995 UNION BY NAME SELECT * FROM {temporary_table} 10996 """ 10997 10998 # transcript table tmp 10999 transcript_table_tmp = "transcripts_tmp" 11000 transcript_table_tmp2 = "transcripts_tmp2" 11001 transcript_table_tmp3 = "transcripts_tmp3" 11002 11003 # Merge on transcript 11004 query_merge_on_transcripts_annotation_fields = [] 11005 11006 # Add transcript list 11007 query_merge_on_transcripts_annotation_fields.append( 11008 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """ 11009 ) 11010 11011 # Aggregate all annotations fields 11012 for annotation_field in set(annotation_fields): 11013 query_merge_on_transcripts_annotation_fields.append( 11014 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """ 11015 ) 11016 11017 # Transcripts mapping 11018 if transcript_id_mapping_file: 11019 11020 # Transcript dataframe 11021 transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe" 11022 transcript_id_mapping_dataframe = transcripts_file_to_df( 11023 transcript_id_mapping_file, column_names=["transcript", "alias"] 11024 ) 11025 11026 # Transcript version remove 11027 if transcript_id_remove_version: 11028 query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped" 11029 query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)" 11030 query_left_join = f""" 11031 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11032 """ 11033 else: 11034 query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, 
{transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped" 11035 query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript" 11036 query_left_join = f""" 11037 LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1)) 11038 """ 11039 11040 # Transcript column for group by merge 11041 query_transcript_merge_group_by = """ 11042 CASE 11043 WHEN transcript_mapped NOT IN ('') 11044 THEN split_part(transcript_mapped, '.', 1) 11045 ELSE split_part(transcript_original, '.', 1) 11046 END 11047 """ 11048 11049 # Merge query 11050 transcripts_tmp2_query = f""" 11051 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)} 11052 FROM ({query_merge}) AS {transcript_table_tmp} 11053 {query_left_join} 11054 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by} 11055 """ 11056 11057 # Retrieve columns after merge 11058 transcripts_tmp2_describe_query = f""" 11059 DESCRIBE {transcripts_tmp2_query} 11060 """ 11061 transcripts_tmp2_describe_list = list( 11062 self.get_query_to_df(query=transcripts_tmp2_describe_query)[ 11063 "column_name" 11064 ] 11065 ) 11066 11067 # Create list of columns for select clause 11068 transcripts_tmp2_describe_select_clause = [] 11069 for field in transcripts_tmp2_describe_list: 11070 if field not in [ 11071 "#CHROM", 11072 "POS", 11073 "REF", 11074 "ALT", 11075 "INFO", 11076 "transcript_mapped", 11077 ]: 11078 as_field = field 11079 if field in ["transcript_original"]: 11080 as_field = "transcripts_mapped" 11081 transcripts_tmp2_describe_select_clause.append( 11082 f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """ 11083 ) 11084 11085 # Merge with mapping 11086 query_merge_on_transcripts = f""" 11087 SELECT 11088 "#CHROM", POS, REF, ALT, INFO, 11089 CASE 11090 WHEN ANY_VALUE(transcript_mapped) NOT IN ('') 11091 THEN ANY_VALUE(transcript_mapped) 11092 ELSE ANY_VALUE(transcript_original) 11093 END AS transcript, 11094 {", ".join(transcripts_tmp2_describe_select_clause)} 11095 FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2} 11096 GROUP BY "#CHROM", POS, REF, ALT, INFO, 11097 {query_transcript_merge_group_by} 11098 """ 11099 11100 # Add transcript filter from mapping file 11101 if transcript_id_mapping_force: 11102 query_merge_on_transcripts = f""" 11103 SELECT * 11104 FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3} 11105 WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe) 11106 """ 11107 11108 # No transcript mapping 11109 else: 11110 11111 # Remove transcript version 11112 if transcript_id_remove_version: 11113 query_transcript_column = f""" 11114 split_part({transcript_table_tmp}.transcript, '.', 1) 11115 """ 11116 else: 11117 query_transcript_column = """ 11118 transcript 11119 """ 11120 11121 # Query sections 11122 query_transcript_column_select = ( 11123 f"{query_transcript_column} AS transcript" 11124 ) 11125 query_transcript_column_group_by = query_transcript_column 11126 11127 # Query for transcripts view 11128 query_merge_on_transcripts = f""" 11129 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11130 
FROM ({query_merge}) AS {transcript_table_tmp} 11131 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11132 """ 11133 11134 # Drop transcripts table if necessary 11135 if transcripts_table_drop: 11136 query_drop = f""" 11137 DROP TABLE IF EXISTS {transcripts_table}; 11138 """ 11139 self.execute_query(query=query_drop) 11140 11141 # # Merge and create transcript view 11142 # query_create_view = f""" 11143 # CREATE TABLE IF NOT EXISTS {transcripts_table} 11144 # AS {query_merge_on_transcripts} 11145 # """ 11146 # self.execute_query(query=query_create_view) 11147 11148 # Using #CHROM chunk 11149 ###### 11150 11151 # List of unique #CHROM 11152 query_unique_chrom = f""" 11153 SELECT DISTINCT "#CHROM" 11154 FROM variants AS subquery 11155 """ 11156 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11157 11158 # Create table with structure but without data, if not exists 11159 query_create_table = f""" 11160 CREATE TABLE IF NOT EXISTS {transcripts_table} AS 11161 SELECT * FROM ({query_merge_on_transcripts}) AS subquery LIMIT 0 11162 """ 11163 self.execute_query(query=query_create_table) 11164 11165 # Process by #CHROM 11166 for chrom in unique_chroms["#CHROM"]: 11167 11168 # Log 11169 log.debug(f"Processing #CHROM={chrom}") 11170 11171 # Select data by #CHROM 11172 query_chunk = f""" 11173 SELECT * 11174 FROM ({query_merge_on_transcripts}) 11175 WHERE "#CHROM" = '{chrom}' 11176 """ 11177 11178 # Insert data 11179 query_insert_chunk = f""" 11180 INSERT INTO {transcripts_table} 11181 {query_chunk} 11182 """ 11183 self.execute_query(query=query_insert_chunk) 11184 11185 # Remove temporary tables 11186 if temporary_tables: 11187 for temporary_table in list(set(temporary_tables)): 11188 query_drop_tmp_table = f""" 11189 DROP TABLE IF EXISTS {temporary_table} 11190 """ 11191 self.execute_query(query=query_drop_tmp_table) 11192 11193 # Remove added columns 11194 for added_column in added_columns: 11195 self.drop_column(column=added_column) 11196 11197 else: 11198 11199 transcripts_table = None 11200 11201 return transcripts_table
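Putting the two struct sources together, a param fragment driving create_transcript_view could look like the sketch below. This is a hedged illustration assembled from the commented examples above; the annotation column names are sample dbNSFP/snpEff-style fields, not a prescribed configuration.

# Hypothetical "transcripts" param fragment, assembled from the commented examples above
param = {
    "transcripts": {
        "table": "transcripts",
        "transcript_id_remove_version": False,
        "struct": {
            # Split delimited annotation columns (dbNSFP-style) into one row per transcript
            "from_columns_map": [
                {
                    "transcripts_column": "Ensembl_transcriptid",
                    "transcripts_infos_columns": ["genename", "LIST_S2_score"],
                }
            ],
            # Parse a structured annotation field (snpEff-style ANN) using its header
            "from_column_format": [
                {
                    "transcripts_column": "ANN",
                    "transcripts_infos_column": "Feature_ID",
                }
            ],
        },
    }
}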
11203 def annotation_format_to_table( 11204 self, 11205 uniquify: bool = True, 11206 annotation_field: str = "ANN", 11207 annotation_id: str = "Feature_ID", 11208 view_name: str = "transcripts", 11209 column_rename: dict = {}, 11210 column_clean: bool = False, 11211 column_case: str = None, 11212 ) -> str: 11213 """ 11214 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11215 structured table format, ensuring unique values and creating a temporary table for further 11216 processing or analysis. 11217 11218 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11219 unique values in the output or not. If set to `True`, the function will make sure that the 11220 output values are unique, defaults to True 11221 :type uniquify: bool (optional) 11222 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11223 that contains the annotation information for each variant. This field is used to extract the 11224 annotation details for further processing in the function. By default, it is set to "ANN", 11225 defaults to ANN 11226 :type annotation_field: str (optional) 11227 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11228 is used to specify the identifier for the annotation feature. This identifier will be used as a 11229 column name in the resulting table or view that is created based on the annotation data. It 11230 helps in uniquely identifying each annotation entry in the resulting table, defaults to Feature_ID 11231 :type annotation_id: str (optional) 11232 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11233 to specify the name of the temporary table that will be created to store the transformed 11234 annotation data. This table will hold the extracted information from the annotation field in a 11235 structured format for further processing or analysis, defaults to transcripts 11236 :type view_name: str (optional) 11237 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11238 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11239 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11240 created based on the annotation data. This feature enables flexible column naming 11241 :type column_rename: dict 11242 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11243 a boolean flag that determines whether the annotation field should undergo a cleaning process. 11244 If set to `True`, the function will clean the annotation field before further processing. This 11245 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11246 to False 11247 :type column_clean: bool (optional) 11248 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11249 used to specify the case transformation to be applied to the column names extracted from the 11250 annotation data. It allows you to set the case of the column names to either lowercase or 11251 uppercase for consistency or other specific requirements during the conversion 11252 :type column_case: str 11253 :return: The function `annotation_format_to_table` is returning the name of the view created, 11254 which is stored in the variable `view_name`. 
11255 """ 11256 11257 # Annotation field 11258 annotation_format = "annotation_explode" 11259 11260 # Transcript annotation 11261 if column_rename: 11262 annotation_id = column_rename.get(annotation_id, annotation_id) 11263 11264 if column_clean: 11265 annotation_id = clean_annotation_field(annotation_id) 11266 11267 # Prefix 11268 prefix = self.get_explode_infos_prefix() 11269 if prefix: 11270 prefix = "INFO/" 11271 11272 # Annotation fields 11273 annotation_infos = prefix + annotation_field 11274 annotation_format_infos = prefix + annotation_format 11275 11276 # Variants table 11277 table_variants = self.get_table_variants() 11278 11279 # Header 11280 vcf_reader = self.get_header() 11281 11282 # Add columns 11283 added_columns = [] 11284 11285 # Explode HGVS field in column 11286 added_columns += self.explode_infos(fields=[annotation_field]) 11287 11288 if annotation_field in vcf_reader.infos: 11289 11290 # Extract ANN header 11291 ann_description = vcf_reader.infos[annotation_field].desc 11292 pattern = r"'(.+?)'" 11293 match = re.search(pattern, ann_description) 11294 if match: 11295 ann_header_match = match.group(1).split(" | ") 11296 ann_header = [] 11297 ann_header_desc = {} 11298 for i in range(len(ann_header_match)): 11299 ann_header_info = "".join( 11300 char for char in ann_header_match[i] if char.isalnum() 11301 ) 11302 ann_header.append(ann_header_info) 11303 ann_header_desc[ann_header_info] = ann_header_match[i] 11304 if not ann_header_desc: 11305 raise ValueError("Invalid header description format") 11306 else: 11307 raise ValueError("Invalid header description format") 11308 11309 # Create variant id 11310 variant_id_column = self.get_variant_id_column() 11311 added_columns += [variant_id_column] 11312 11313 # Get list of #CHROM 11314 query_unique_chrom = f""" 11315 SELECT DISTINCT "#CHROM" 11316 FROM variants AS subquery 11317 """ 11318 unique_chroms = self.get_query_to_df(query=query_unique_chrom) 11319 11320 # Base for database anontation format 11321 dataframe_annotation_format_base = f""" 11322 SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" 11323 FROM {table_variants} 11324 """ 11325 11326 # Create dataframe for keys column type 11327 dataframe_annotation_format = self.get_query_to_df( 11328 f""" {dataframe_annotation_format_base} LIMIT 1000 """ 11329 ) 11330 11331 # Define a vectorized function to apply explode_annotation_format 11332 vectorized_explode_annotation_format = np.vectorize( 11333 lambda x: explode_annotation_format( 11334 annotation=str(x), 11335 uniquify=uniquify, 11336 output_format="JSON", 11337 prefix="", 11338 header=list(ann_header_desc.values()), 11339 ) 11340 ) 11341 11342 # Assign the exploded annotations back to the dataframe 11343 dataframe_annotation_format[annotation_format_infos] = ( 11344 vectorized_explode_annotation_format( 11345 dataframe_annotation_format[annotation_infos].to_numpy() 11346 ) 11347 ) 11348 11349 # Find keys 11350 query_json = f""" 11351 SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' 11352 FROM dataframe_annotation_format; 11353 """ 11354 df_keys = self.get_query_to_df(query=query_json) 11355 11356 # Check keys 11357 query_json_key = [] 11358 for _, row in df_keys.iterrows(): 11359 11360 # Key 11361 key = row.iloc[0] 11362 key_clean = key 11363 11364 # key rename 11365 if column_rename: 11366 key_clean = column_rename.get(key_clean, key_clean) 11367 11368 # key clean 11369 if column_clean: 11370 key_clean = clean_annotation_field(key_clean) 11371 11372 # Key case 
11373 if column_case: 11374 if column_case.lower() in ["lower"]: 11375 key_clean = key_clean.lower() 11376 elif column_case.lower() in ["upper"]: 11377 key_clean = key_clean.upper() 11378 11379 # Type 11380 query_json_type = f""" 11381 SELECT * 11382 FROM ( 11383 SELECT 11384 NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '') AS '{key_clean}' 11385 FROM 11386 dataframe_annotation_format 11387 ) 11388 WHERE "{key_clean}" NOT NULL AND "{key_clean}" NOT IN ('') 11389 """ 11390 11391 # Get DataFrame from query 11392 df_json_type = self.get_query_to_df(query=query_json_type) 11393 11394 # Detect column type 11395 column_type = detect_column_type(df_json_type[key_clean]) 11396 11397 # Free up memory 11398 del df_json_type 11399 11400 # Append 11401 query_json_key.append( 11402 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11403 ) 11404 11405 # Create table with structure but without data, if not exists 11406 query_create_table = f""" 11407 CREATE TABLE IF NOT EXISTS {view_name} 11408 AS ( 11409 SELECT *, {annotation_id} AS 'transcript' 11410 FROM ( 11411 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11412 FROM dataframe_annotation_format 11413 ) 11414 LIMIT 0 11415 ); 11416 """ 11417 self.execute_query(query=query_create_table) 11418 11419 # Free up memory 11420 del dataframe_annotation_format 11421 11422 # Insert data by chromosome 11423 for chrom in unique_chroms["#CHROM"]: 11424 11425 # Log 11426 log.debug(f"Processing #CHROM={chrom}") 11427 11428 # Create dataframe 11429 dataframe_annotation_format = self.get_query_to_df( 11430 f""" {dataframe_annotation_format_base} WHERE "#CHROM" = '{chrom}' """ 11431 ) 11432 11433 # Define a vectorized function to apply explode_annotation_format 11434 vectorized_explode_annotation_format = np.vectorize( 11435 lambda x: explode_annotation_format( 11436 annotation=str(x), 11437 uniquify=uniquify, 11438 output_format="JSON", 11439 prefix="", 11440 header=list(ann_header_desc.values()), 11441 ) 11442 ) 11443 11444 # Assign the exploded annotations back to the dataframe 11445 dataframe_annotation_format[annotation_format_infos] = ( 11446 vectorized_explode_annotation_format( 11447 dataframe_annotation_format[annotation_infos].to_numpy() 11448 ) 11449 ) 11450 11451 # Insert data into tmp table 11452 query_insert_chunk = f""" 11453 INSERT INTO {view_name} 11454 SELECT *, {annotation_id} AS 'transcript' 11455 FROM ( 11456 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11457 FROM dataframe_annotation_format 11458 ) 11459 """ 11460 self.execute_query(query=query_insert_chunk) 11461 11462 # Free up memory 11463 del dataframe_annotation_format 11464 11465 else: 11466 11467 # Return None 11468 view_name = None 11469 11470 # Remove added columns 11471 for added_column in added_columns: 11472 self.drop_column(column=added_column) 11473 11474 return view_name
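The header extraction above takes the quoted format list out of the ANN INFO description and keeps only alphanumeric characters to build column-safe names. A small self-contained illustration (the description string is hypothetical, in the snpEff style):

import re

# Hypothetical snpEff-style ANN description; the format list is quoted inside it
ann_description = "Functional annotations: 'Allele | Annotation | Gene_Name | Feature_ID'"

match = re.search(r"'(.+?)'", ann_description)
fields = match.group(1).split(" | ") if match else []

# Keep only alphanumeric characters for column-safe names, as done above
columns = ["".join(c for c in f if c.isalnum()) for f in fields]
print(columns)  # ['Allele', 'Annotation', 'GeneName', 'FeatureID']

Note that isalnum() also drops underscores, so Feature_ID becomes the column FeatureID.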
11476 def transcript_view_to_variants( 11477 self, 11478 transcripts_table: str = None, 11479 transcripts_column_id: str = None, 11480 transcripts_info_json: str = None, 11481 transcripts_info_field_json: str = None, 11482 transcripts_info_format: str = None, 11483 transcripts_info_field_format: str = None, 11484 param: dict = {}, 11485 ) -> bool: 11486 """ 11487 The `transcript_view_to_variants` function updates a variants table with information from 11488 transcripts in JSON format. 11489 11490 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11491 table containing the transcripts data. If this parameter is not provided, the function will 11492 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11493 :type transcripts_table: str 11494 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11495 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11496 identifier is used to match transcripts with variants in the database 11497 :type transcripts_column_id: str 11498 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11499 of the column in the variants table where the transcripts information will be stored in JSON 11500 format. This parameter allows you to define the column in the variants table that will hold the 11501 JSON-formatted information about transcripts 11502 :type transcripts_info_json: str 11503 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11504 specify the field in the VCF header that will contain information about transcripts in JSON 11505 format. This field will be added to the VCF header as an INFO field with the specified name 11506 :type transcripts_info_field_json: str 11507 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11508 format of the information about transcripts that will be stored in the variants table. This 11509 format can be used to define how the transcript information will be structured or displayed 11510 within the variants table 11511 :type transcripts_info_format: str 11512 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11513 specify the field in the VCF header that will contain information about transcripts in a 11514 specific format. This field will be added to the VCF header as an INFO field with the specified 11515 name 11516 :type transcripts_info_field_format: str 11517 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11518 that contains various configuration settings related to transcripts. It is used to provide 11519 default values for certain parameters if they are not explicitly provided when calling the 11520 method. The `param` dictionary can be passed as an argument 11521 :type param: dict 11522 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11523 if the operation is successful and `False` if certain conditions are not met. 
11524 """ 11525 11526 msg_info_prefix = "Start transcripts view to variants annotations" 11527 11528 log.debug(f"{msg_info_prefix}...") 11529 11530 # Default 11531 transcripts_table_default = "transcripts" 11532 transcripts_column_id_default = "transcript" 11533 transcripts_info_json_default = None 11534 transcripts_info_format_default = None 11535 transcripts_info_field_json_default = None 11536 transcripts_info_field_format_default = None 11537 11538 # Param 11539 if not param: 11540 param = self.get_param() 11541 11542 # Transcripts table 11543 if transcripts_table is None: 11544 transcripts_table = param.get("transcripts", {}).get( 11545 "table", transcripts_table_default 11546 ) 11547 11548 # Transcripts column ID 11549 if transcripts_column_id is None: 11550 transcripts_column_id = param.get("transcripts", {}).get( 11551 "column_id", transcripts_column_id_default 11552 ) 11553 11554 # Transcripts info json 11555 if transcripts_info_json is None: 11556 transcripts_info_json = param.get("transcripts", {}).get( 11557 "transcripts_info_json", transcripts_info_json_default 11558 ) 11559 11560 # Transcripts info field JSON 11561 if transcripts_info_field_json is None: 11562 transcripts_info_field_json = param.get("transcripts", {}).get( 11563 "transcripts_info_field_json", transcripts_info_field_json_default 11564 ) 11565 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11566 # transcripts_info_json = transcripts_info_field_json 11567 11568 # Transcripts info format 11569 if transcripts_info_format is None: 11570 transcripts_info_format = param.get("transcripts", {}).get( 11571 "transcripts_info_format", transcripts_info_format_default 11572 ) 11573 11574 # Transcripts info field FORMAT 11575 if transcripts_info_field_format is None: 11576 transcripts_info_field_format = param.get("transcripts", {}).get( 11577 "transcripts_info_field_format", transcripts_info_field_format_default 11578 ) 11579 # if ( 11580 # transcripts_info_field_format is not None 11581 # and transcripts_info_format is None 11582 # ): 11583 # transcripts_info_format = transcripts_info_field_format 11584 11585 # Variants table 11586 table_variants = self.get_table_variants() 11587 11588 # Check info columns param 11589 if ( 11590 transcripts_info_json is None 11591 and transcripts_info_field_json is None 11592 and transcripts_info_format is None 11593 and transcripts_info_field_format is None 11594 ): 11595 return False 11596 11597 # Transcripts infos columns 11598 query_transcripts_infos_columns = f""" 11599 SELECT * 11600 FROM ( 11601 DESCRIBE SELECT * FROM {transcripts_table} 11602 ) 11603 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11604 """ 11605 transcripts_infos_columns = list( 11606 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11607 ) 11608 11609 # View results 11610 clause_select = [] 11611 clause_to_json = [] 11612 clause_to_format = [] 11613 for field in transcripts_infos_columns: 11614 # Do not consider INFO field for export into fields 11615 if field not in ["INFO"]: 11616 clause_select.append( 11617 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11618 ) 11619 clause_to_json.append(f""" '{field}': "{field}" """) 11620 clause_to_format.append(f""" "{field}" """) 11621 11622 # Update 11623 update_set_json = [] 11624 update_set_format = [] 11625 11626 # VCF header 11627 vcf_reader = self.get_header() 11628 11629 # Transcripts to info column in JSON 11630 if transcripts_info_json: 
11631 11632 # Create column on variants table 11633 self.add_column( 11634 table_name=table_variants, 11635 column_name=transcripts_info_json, 11636 column_type="JSON", 11637 default_value=None, 11638 drop=False, 11639 ) 11640 11641 # Add header 11642 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11643 transcripts_info_json, 11644 ".", 11645 "String", 11646 "Transcripts in JSON format", 11647 "unknown", 11648 "unknown", 11649 self.code_type_map["String"], 11650 ) 11651 11652 # Add to update 11653 update_set_json.append( 11654 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11655 ) 11656 11657 # Transcripts to info field in JSON 11658 if transcripts_info_field_json: 11659 11660 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11661 11662 # Add to update 11663 update_set_json.append( 11664 f""" 11665 INFO = concat( 11666 CASE 11667 WHEN INFO NOT IN ('', '.') 11668 THEN INFO 11669 ELSE '' 11670 END, 11671 CASE 11672 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11673 THEN concat( 11674 ';{transcripts_info_field_json}=', 11675 t.{transcripts_info_json} 11676 ) 11677 ELSE '' 11678 END 11679 ) 11680 """ 11681 ) 11682 11683 # Add header 11684 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11685 transcripts_info_field_json, 11686 ".", 11687 "String", 11688 "Transcripts in JSON format", 11689 "unknown", 11690 "unknown", 11691 self.code_type_map["String"], 11692 ) 11693 11694 if update_set_json: 11695 11696 # Update query 11697 query_update = f""" 11698 UPDATE {table_variants} 11699 SET {", ".join(update_set_json)} 11700 FROM 11701 ( 11702 SELECT 11703 "#CHROM", POS, REF, ALT, 11704 concat( 11705 '{{', 11706 string_agg( 11707 '"' || "{transcripts_column_id}" || '":' || 11708 to_json(json_output) 11709 ), 11710 '}}' 11711 )::JSON AS {transcripts_info_json} 11712 FROM 11713 ( 11714 SELECT 11715 "#CHROM", POS, REF, ALT, 11716 "{transcripts_column_id}", 11717 to_json( 11718 {{{",".join(clause_to_json)}}} 11719 )::JSON AS json_output 11720 FROM 11721 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11722 WHERE "{transcripts_column_id}" IS NOT NULL 11723 ) 11724 GROUP BY "#CHROM", POS, REF, ALT 11725 ) AS t 11726 WHERE {table_variants}."#CHROM" = t."#CHROM" 11727 AND {table_variants}."POS" = t."POS" 11728 AND {table_variants}."REF" = t."REF" 11729 AND {table_variants}."ALT" = t."ALT" 11730 """ 11731 11732 self.execute_query(query=query_update) 11733 11734 # Transcripts to info column in FORMAT 11735 if transcripts_info_format: 11736 11737 # Create column on variants table 11738 self.add_column( 11739 table_name=table_variants, 11740 column_name=transcripts_info_format, 11741 column_type="VARCHAR", 11742 default_value=None, 11743 drop=False, 11744 ) 11745 11746 # Add header 11747 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11748 transcripts_info_format, 11749 ".", 11750 "String", 11751 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11752 "unknown", 11753 "unknown", 11754 self.code_type_map["String"], 11755 ) 11756 11757 # Add to update 11758 update_set_format.append( 11759 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11760 ) 11761 11762 else: 11763 11764 # Set variable for internal queries 11765 transcripts_info_format = "transcripts_info_format" 11766 11767 # Transcripts to info field in FORMAT 11768 if transcripts_info_field_format: 11769 11770 log.debug(f"{msg_info_prefix} - Annotation in 
structured format...") 11771 11772 # Add to update 11773 update_set_format.append( 11774 f""" 11775 INFO = concat( 11776 CASE 11777 WHEN INFO NOT IN ('', '.') 11778 THEN INFO 11779 ELSE '' 11780 END, 11781 CASE 11782 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11783 THEN concat( 11784 ';{transcripts_info_field_format}=', 11785 t.{transcripts_info_format} 11786 ) 11787 ELSE '' 11788 END 11789 ) 11790 """ 11791 ) 11792 11793 # Add header 11794 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11795 transcripts_info_field_format, 11796 ".", 11797 "String", 11798 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11799 "unknown", 11800 "unknown", 11801 self.code_type_map["String"], 11802 ) 11803 11804 if update_set_format: 11805 11806 # Update query 11807 query_update = f""" 11808 UPDATE {table_variants} 11809 SET {", ".join(update_set_format)} 11810 FROM 11811 ( 11812 SELECT 11813 "#CHROM", POS, REF, ALT, 11814 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11815 FROM 11816 ( 11817 SELECT 11818 "#CHROM", POS, REF, ALT, 11819 "{transcripts_column_id}", 11820 concat( 11821 "{transcripts_column_id}", 11822 '|', 11823 {", '|', ".join(clause_to_format)} 11824 ) AS {transcripts_info_format} 11825 FROM 11826 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11827 ) 11828 GROUP BY "#CHROM", POS, REF, ALT 11829 ) AS t 11830 WHERE {table_variants}."#CHROM" = t."#CHROM" 11831 AND {table_variants}."POS" = t."POS" 11832 AND {table_variants}."REF" = t."REF" 11833 AND {table_variants}."ALT" = t."ALT" 11834 """ 11835 11836 self.execute_query(query=query_update) 11837 11838 return True
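The JSON branch above builds one object per variant, keyed by transcript ID, with string_agg over to_json of a DuckDB struct. A reduced sketch of that aggregation on a hypothetical two-transcript table (aggregation order within a group is not guaranteed):

import duckdb

con = duckdb.connect()
con.execute("""
    CREATE TABLE transcripts AS
    SELECT * FROM (VALUES
        ('chr1', 100, 'A', 'T', 'NM_0001', 'HIGH'),
        ('chr1', 100, 'A', 'T', 'NM_0002', 'LOW')
    ) AS t("#CHROM", POS, REF, ALT, transcript, impact)
""")

# One JSON object per variant, keyed by transcript id, as in the UPDATE above
df = con.execute("""
    SELECT "#CHROM", POS, REF, ALT,
           concat('{', string_agg('"' || transcript || '":' || to_json({'impact': impact})), '}') AS transcripts_json
    FROM transcripts
    GROUP BY "#CHROM", POS, REF, ALT
""").df()

print(df["transcripts_json"][0])
# {"NM_0001":{"impact":"HIGH"},"NM_0002":{"impact":"LOW"}}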
11840 def rename_info_fields( 11841 self, fields_to_rename: dict = None, table: str = None 11842 ) -> dict: 11843 """ 11844 The `rename_info_fields` function renames specified fields in a VCF file header and updates 11845 corresponding INFO fields in the variants table. 11846 11847 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11848 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11849 represent the original field names that need to be renamed, and the corresponding values 11850 represent the new names to which the fields should be renamed 11851 :type fields_to_rename: dict 11852 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11853 the table in which the variants data is stored. This table contains information about genetic 11854 variants, and the function updates the corresponding INFO fields in this table when renaming 11855 specified fields in the VCF file header 11856 :type table: str 11857 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11858 the original field names as keys and their corresponding new names (or None if the field was 11859 removed) as values after renaming or removing specified fields in a VCF file header and updating 11860 corresponding INFO fields in the variants table. 11861 """ 11862 11863 # Init 11864 fields_renamed = {} 11865 config = self.get_config() 11866 access = config.get("access") 11867 11868 if table is None: 11869 table = self.get_table_variants() 11870 11871 # regexp replace function 11872 regex_replace_dict = {} 11873 regex_replace_nb = 0 11874 regex_replace_partition = 125 11875 regex_replace = "concat(INFO, ';')" # Add ';' to reduce regexp complexity 11876 11877 if fields_to_rename is not None and access not in ["RO"]: 11878 11879 log.info("Rename or remove fields...") 11880 11881 # Header 11882 header = self.get_header() 11883 11884 for field_to_rename, field_renamed in fields_to_rename.items(): 11885 11886 if field_to_rename in header.infos: 11887 11888 # Rename header 11889 if field_renamed is not None: 11890 header.infos[field_renamed] = vcf.parser._Info( 11891 field_renamed, 11892 header.infos[field_to_rename].num, 11893 header.infos[field_to_rename].type, 11894 header.infos[field_to_rename].desc, 11895 header.infos[field_to_rename].source, 11896 header.infos[field_to_rename].version, 11897 header.infos[field_to_rename].type_code, 11898 ) 11899 del header.infos[field_to_rename] 11900 11901 # Rename INFO patterns 11902 field_pattern = rf"(^|;)({field_to_rename})(=[^;]*)?;" 11903 if field_renamed is not None: 11904 field_renamed_pattern = rf"\1{field_renamed}\3;" 11905 else: 11906 field_renamed_pattern = r"\1" 11907 11908 # regexp replace 11909 regex_replace_nb += 1 11910 regex_replace_key = math.floor( 11911 regex_replace_nb / regex_replace_partition 11912 ) 11913 if (regex_replace_nb % regex_replace_partition) == 0: 11914 regex_replace = "concat(INFO, ';')" 11915 regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')" 11916 regex_replace_dict[regex_replace_key] = regex_replace 11917 11918 # Return 11919 fields_renamed[field_to_rename] = field_renamed 11920 11921 # Log 11922 if field_renamed is not None: 11923 log.info( 11924 f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'" 11925 ) 11926 else: 11927 log.info( 11928 f"Rename or remove fields - field '{field_to_rename}' removed" 11929 ) 11930 
11931 else: 11932 11933 log.warning( 11934 f"Rename or remove fields - field '{field_to_rename}' not in header" 11935 ) 11936 11937 # Rename INFO 11938 for regex_replace_key, regex_replace in regex_replace_dict.items(): 11939 log.info( 11940 f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]..." 11941 ) 11942 query = f""" 11943 UPDATE {table} 11944 SET 11945 INFO = regexp_replace({regex_replace}, ';$', '') 11946 """ 11947 log.debug(f"query={query}") 11948 self.execute_query(query=query) 11949 11950 return fields_renamed
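For intuition, the rename/remove pattern assembled above can be exercised with Python's re on a sample INFO string. The function itself runs the pattern through DuckDB's regexp_replace, and it appends a ';' first exactly as mimicked here; the field names are placeholders.

import re

info = "AF=0.5;OLD=1;DP=10" + ";"  # trailing ';' added to simplify the pattern, as above

# Rename OLD -> NEW: group 3 keeps the '=value' part when present
renamed = re.sub(r"(^|;)(OLD)(=[^;]*)?;", r"\1NEW\3;", info)
print(re.sub(r";$", "", renamed))  # AF=0.5;NEW=1;DP=10

# Remove OLD entirely: the replacement keeps only the leading separator
removed = re.sub(r"(^|;)(OLD)(=[^;]*)?;", r"\1", info)
print(re.sub(r";$", "", removed))  # AF=0.5;DP=10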
    def calculation_rename_info_fields(
        self,
        fields_to_rename: dict = None,
        table: str = None,
        operation_name: str = "RENAME_INFO_FIELDS",
    ) -> None:
        """
        The `calculation_rename_info_fields` function retrieves the fields to rename and the table
        from the parameter dictionary, uses them as fallbacks when not provided as arguments, and
        then calls `rename_info_fields` to rename the fields.

        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
        renamed in a table. Each key-value pair in the dictionary represents the original field
        name as the key and the new field name as the value
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used
        to specify the name of the table for which the fields are to be renamed. It is a string
        type parameter
        :type table: str
        :param operation_name: The `operation_name` parameter in the
        `calculation_rename_info_fields` method is a string that specifies the name of the
        operation being performed. It is used as the key under which the operation's parameters
        are looked up in the `calculation` section of the parameters, defaults to
        RENAME_INFO_FIELDS
        :type operation_name: str (optional)
        """

        # Param
        param = self.get_param()

        # Get param fields to rename
        param_fields_to_rename = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("fields_to_rename", None)
        )

        # Get param table
        param_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("table", None)
        )

        # Init fields_to_rename
        if fields_to_rename is None:
            fields_to_rename = param_fields_to_rename

        # Init table
        if table is None:
            table = param_table

        renamed_fields = self.rename_info_fields(
            fields_to_rename=fields_to_rename, table=table
        )

        log.debug(f"renamed_fields:{renamed_fields}")
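    # Usage sketch (illustrative only): the same rename driven through the
    # parameter dictionary, which is how calculation_rename_info_fields looks
    # up its defaults. The nesting mirrors the param.get() chain above; the
    # field names are hypothetical.
    #
    #   param = {
    #       "calculation": {
    #           "calculations": {
    #               "RENAME_INFO_FIELDS": {
    #                   "fields_to_rename": {"DP": "DEPTH", "OLD_TAG": None},
    #                   "table": "variants",
    #               }
    #           }
    #       }
    #   }
    #   variants.set_param(param)
    #   variants.calculation_rename_info_fields()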